From 395b86d7d6f619fb8dd176a4d63f9c1755e189df Mon Sep 17 00:00:00 2001 From: Zephyr Date: Wed, 13 Dec 2023 16:08:02 +0800 Subject: [PATCH 1/7] Dev 1.x Adapt PETR to mmcv v2 version dataset (#2800) --- projects/PETR/README.md | 4 ++-- projects/PETR/petr/petr_head.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/projects/PETR/README.md b/projects/PETR/README.md index 08e1e10fef..dcf7182ed2 100644 --- a/projects/PETR/README.md +++ b/projects/PETR/README.md @@ -16,7 +16,7 @@ This is an implementation of *PETR*. In MMDet3D's root directory, run the following command to train the model: ```bash -python tools/train.py projects/PETR/config/petr/petr_vovnet_gridmask_p4_800x320.py +python tools/train.py projects/PETR/configs/petr_vovnet_gridmask_p4_800x320.py ``` ### Testing commands @@ -24,7 +24,7 @@ python tools/train.py projects/PETR/config/petr/petr_vovnet_gridmask_p4_800x320. In MMDet3D's root directory, run the following command to test the model: ```bash -python tools/test.py projects/PETR/config/petr/petr_vovnet_gridmask_p4_800x320.py ${CHECKPOINT_PATH} +python tools/test.py projects/PETR/configs/petr_vovnet_gridmask_p4_800x320.py ${CHECKPOINT_PATH} ``` ## Results diff --git a/projects/PETR/petr/petr_head.py b/projects/PETR/petr/petr_head.py index 2b6e088e57..acebd627f4 100644 --- a/projects/PETR/petr/petr_head.py +++ b/projects/PETR/petr/petr_head.py @@ -446,7 +446,7 @@ def forward(self, mlvl_feats, img_metas): masks = x.new_ones((batch_size, num_cams, input_img_h, input_img_w)) for img_id in range(batch_size): for cam_id in range(num_cams): - img_h, img_w, _ = img_metas[img_id]['img_shape'][cam_id] + img_h, img_w = img_metas[img_id]['img_shape'][cam_id] masks[img_id, cam_id, :img_h, :img_w] = 0 x = self.input_proj(x.flatten(0, 1)) x = x.view(batch_size, num_cams, *x.shape[-3:]) From 5b88c7b8352f6cddf3cfdc47fa37e0fa0812bacc Mon Sep 17 00:00:00 2001 From: Sun Jiahao <72679458+sunjiahao1999@users.noreply.github.com> Date: Wed, 27 Dec 2023 17:15:55 +0800 Subject: [PATCH 2/7] [Refactor] Refactor Waymo dataset_converter/dataset/evaluator (#2836) Co-authored-by: sjh --- .gitignore | 1 + configs/_base_/datasets/waymoD5-3d-3class.py | 17 +- configs/_base_/datasets/waymoD5-3d-car.py | 15 +- docs/en/advanced_guides/datasets/waymo.md | 70 +- docs/zh_cn/advanced_guides/datasets/waymo.md | 66 +- mmdet3d/datasets/det3d_dataset.py | 17 +- mmdet3d/datasets/waymo_dataset.py | 117 ++- mmdet3d/engine/hooks/visualization_hook.py | 4 +- .../waymo_utils/prediction_to_waymo.py | 372 ++-------- mmdet3d/evaluation/metrics/waymo_metric.py | 637 ++++------------ ...-attn_4xb4-cyclic-20e_waymoD5-3d-3class.py | 13 +- .../waymo/kitti_format/waymo_infos_train.pkl | Bin 3445 -> 2020 bytes .../waymo/kitti_format/waymo_infos_val.pkl | Bin 7000 -> 2020 bytes tests/test_datasets/test_waymo_dataset.py | 80 ++ tools/create_data.py | 134 ++-- tools/create_data.sh | 7 +- .../dataset_converters/create_gt_database.py | 28 +- tools/dataset_converters/waymo_converter.py | 697 ++++++++++-------- 18 files changed, 990 insertions(+), 1285 deletions(-) create mode 100644 tests/test_datasets/test_waymo_dataset.py diff --git a/.gitignore b/.gitignore index 27cb9c7cb4..2fefc6a904 100644 --- a/.gitignore +++ b/.gitignore @@ -134,3 +134,4 @@ data/sunrgbd/OFFICIAL_SUNRGBD/ # Waymo evaluation mmdet3d/evaluation/functional/waymo_utils/compute_detection_metrics_main mmdet3d/evaluation/functional/waymo_utils/compute_detection_let_metrics_main 
+mmdet3d/evaluation/functional/waymo_utils/compute_segmentation_metrics_main diff --git a/configs/_base_/datasets/waymoD5-3d-3class.py b/configs/_base_/datasets/waymoD5-3d-3class.py index e5240b629e..f8f14998d2 100644 --- a/configs/_base_/datasets/waymoD5-3d-3class.py +++ b/configs/_base_/datasets/waymoD5-3d-3class.py @@ -89,7 +89,10 @@ dict( type='PointsRangeFilter', point_cloud_range=point_cloud_range) ]), - dict(type='Pack3DDetInputs', keys=['points']) + dict( + type='Pack3DDetInputs', + keys=['points'], + meta_keys=['box_type_3d', 'sample_idx', 'context_name', 'timestamp']) ] # construct a pipeline for data and gt loading in show function # please keep its loading function consistent with test_pipeline (e.g. client) @@ -100,7 +103,10 @@ load_dim=6, use_dim=5, backend_args=backend_args), - dict(type='Pack3DDetInputs', keys=['points']), + dict( + type='Pack3DDetInputs', + keys=['points'], + meta_keys=['box_type_3d', 'sample_idx', 'context_name', 'timestamp']) ] train_dataloader = dict( @@ -164,12 +170,7 @@ backend_args=backend_args)) val_evaluator = dict( - type='WaymoMetric', - ann_file='./data/waymo/kitti_format/waymo_infos_val.pkl', - waymo_bin_file='./data/waymo/waymo_format/gt.bin', - data_root='./data/waymo/waymo_format', - backend_args=backend_args, - convert_kitti_format=False) + type='WaymoMetric', waymo_bin_file='./data/waymo/waymo_format/gt.bin') test_evaluator = val_evaluator vis_backends = [dict(type='LocalVisBackend')] diff --git a/configs/_base_/datasets/waymoD5-3d-car.py b/configs/_base_/datasets/waymoD5-3d-car.py index f95ac1d817..972e9289be 100644 --- a/configs/_base_/datasets/waymoD5-3d-car.py +++ b/configs/_base_/datasets/waymoD5-3d-car.py @@ -62,7 +62,8 @@ dict(type='PointShuffle'), dict( type='Pack3DDetInputs', - keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) + keys=['points'], + meta_keys=['box_type_3d', 'sample_idx', 'context_name', 'timestamp']) ] test_pipeline = [ dict( @@ -86,7 +87,10 @@ dict( type='PointsRangeFilter', point_cloud_range=point_cloud_range) ]), - dict(type='Pack3DDetInputs', keys=['points']) + dict( + type='Pack3DDetInputs', + keys=['points'], + meta_keys=['box_type_3d', 'sample_idx', 'context_name', 'timestamp']) ] # construct a pipeline for data and gt loading in show function # please keep its loading function consistent with test_pipeline (e.g. client) @@ -161,12 +165,7 @@ backend_args=backend_args)) val_evaluator = dict( - type='WaymoMetric', - ann_file='./data/waymo/kitti_format/waymo_infos_val.pkl', - waymo_bin_file='./data/waymo/waymo_format/gt.bin', - data_root='./data/waymo/waymo_format', - convert_kitti_format=False, - backend_args=backend_args) + type='WaymoMetric', waymo_bin_file='./data/waymo/waymo_format/gt.bin') test_evaluator = val_evaluator vis_backends = [dict(type='LocalVisBackend')] diff --git a/docs/en/advanced_guides/datasets/waymo.md b/docs/en/advanced_guides/datasets/waymo.md index 2e52b9dd10..f28ca253b8 100644 --- a/docs/en/advanced_guides/datasets/waymo.md +++ b/docs/en/advanced_guides/datasets/waymo.md @@ -7,12 +7,7 @@ This page provides specific tutorials about the usage of MMDetection3D for Waymo Before preparing Waymo dataset, if you only installed requirements in `requirements/build.txt` and `requirements/runtime.txt` before, please install the official package for this dataset at first by running ``` -# tf 2.1.0. 
-pip install waymo-open-dataset-tf-2-1-0==1.2.0 -# tf 2.0.0 -# pip install waymo-open-dataset-tf-2-0-0==1.2.0 -# tf 1.15.0 -# pip install waymo-open-dataset-tf-1-15-0==1.2.0 +pip install waymo-open-dataset-tf-2-6-0 ``` or @@ -38,15 +33,19 @@ mmdetection3d │ │ │ ├── validation │ │ │ ├── testing │ │ │ ├── gt.bin +│ │ │ ├── cam_gt.bin +│ │ │ ├── fov_gt.bin │ │ ├── kitti_format │ │ │ ├── ImageSets ``` -You can download Waymo open dataset V1.2 [HERE](https://waymo.com/open/download/) and its data split [HERE](https://drive.google.com/drive/folders/18BVuF_RYJF0NjZpt8SnfzANiakoRMf0o?usp=sharing). Then put `tfrecord` files into corresponding folders in `data/waymo/waymo_format/` and put the data split txt files into `data/waymo/kitti_format/ImageSets`. Download ground truth bin files for validation set [HERE](https://console.cloud.google.com/storage/browser/waymo_open_dataset_v_1_2_0/validation/ground_truth_objects) and put it into `data/waymo/waymo_format/`. A tip is that you can use `gsutil` to download the large-scale dataset with commands. You can take this [tool](https://github.com/RalphMao/Waymo-Dataset-Tool) as an example for more details. Subsequently, prepare Waymo data by running +You can download Waymo open dataset V1.4 [HERE](https://waymo.com/open/download/) and its data split [HERE](https://drive.google.com/drive/folders/18BVuF_RYJF0NjZpt8SnfzANiakoRMf0o?usp=sharing). Then put `tfrecord` files into corresponding folders in `data/waymo/waymo_format/` and put the data split txt files into `data/waymo/kitti_format/ImageSets`. Download ground truth bin files for validation set [HERE](https://console.cloud.google.com/storage/browser/waymo_open_dataset_v_1_2_0/validation/ground_truth_objects) and put it into `data/waymo/waymo_format/`. A tip is that you can use `gsutil` to download the large-scale dataset with commands. You can take this [tool](https://github.com/RalphMao/Waymo-Dataset-Tool) as an example for more details. Subsequently, prepare Waymo data by running ```bash -python tools/create_data.py waymo --root-path ./data/waymo/ --out-dir ./data/waymo/ --workers 128 --extra-tag waymo +# TF_CPP_MIN_LOG_LEVEL=3 will disable all logging output from TensorFlow. +# The number of `--workers` depends on the maximum number of cores in your CPU. +TF_CPP_MIN_LOG_LEVEL=3 python tools/create_data.py waymo --root-path ./data/waymo --out-dir ./data/waymo --workers 128 --extra-tag waymo --version v1.4 ``` Note that if your local disk does not have enough space for saving converted data, you can change the `--out-dir` to anywhere else. Just remember to create folders and prepare data there in advance and link them back to `data/waymo/kitti_format` after the data conversion. @@ -65,22 +64,16 @@ mmdetection3d │ │ │ ├── validation │ │ │ ├── testing │ │ │ ├── gt.bin +│ │ │ ├── cam_gt.bin +│ │ │ ├── fov_gt.bin │ │ ├── kitti_format │ │ │ ├── ImageSets │ │ │ ├── training -│ │ │ │ ├── calib │ │ │ │ ├── image_0 │ │ │ │ ├── image_1 │ │ │ │ ├── image_2 │ │ │ │ ├── image_3 │ │ │ │ ├── image_4 -│ │ │ │ ├── label_0 -│ │ │ │ ├── label_1 -│ │ │ │ ├── label_2 -│ │ │ │ ├── label_3 -│ │ │ │ ├── label_4 -│ │ │ │ ├── label_all -│ │ │ │ ├── pose │ │ │ │ ├── velodyne │ │ │ ├── testing │ │ │ │ ├── (the same as training) @@ -93,7 +86,48 @@ mmdetection3d ``` -Here because there are several cameras, we store the corresponding image and labels that can be projected to that camera respectively and save pose for further usage of consecutive frames point clouds. 
We use a coding way `{a}{bbb}{ccc}` to name the data for each frame, where `a` is the prefix for different split (`0` for training, `1` for validation and `2` for testing), `bbb` for segment index and `ccc` for frame index. You can easily locate the required frame according to this naming rule. We gather the data for training and validation together as KITTI and store the indices for different set in the `ImageSet` files.
+- `kitti_format/training/image_{0-4}/{a}{bbb}{ccc}.jpg` Because there are several cameras, we store the corresponding images separately. We use a coding way `{a}{bbb}{ccc}` to name the data for each frame, where `a` is the prefix for the different splits (`0` for training, `1` for validation and `2` for testing), `bbb` is the segment index and `ccc` is the frame index. You can easily locate the required frame according to this naming rule. We gather the data for training and validation together as in KITTI and store the indices for the different splits in the `ImageSets` files.
+- `kitti_format/training/velodyne/{a}{bbb}{ccc}.bin` Point cloud data for each frame.
+- `kitti_format/waymo_gt_database/xxx_{Car/Pedestrian/Cyclist}_x.bin`. Point cloud data included in each 3D bounding box of the training dataset. These point clouds are used in data augmentation, e.g. `ObjectSample`. `xxx` is the index of the training sample and `x` is the index of the object in this frame.
+- `kitti_format/waymo_infos_train.pkl`. Training dataset information, a dict containing two keys: `metainfo` and `data_list`. `metainfo` contains the basic information of the dataset itself, such as `dataset`, `version` and `info_version`, while `data_list` is a list of dicts, and each dict (hereinafter referred to as `info`) contains all the detailed information of a single sample as follows:
+  - info\['sample_idx'\]: The index of this sample in the whole dataset.
+  - info\['ego2global'\]: The transformation matrix from the ego vehicle to global coordinates. (4x4 list)
+  - info\['timestamp'\]: Timestamp of the sample data.
+  - info\['context_name'\]: The context name of the sample, indicating which `*.tfrecord` segment it was extracted from.
+  - info\['lidar_points'\]: A dict containing all the information related to the lidar points.
+    - info\['lidar_points'\]\['lidar_path'\]: The filename of the lidar point cloud data.
+    - info\['lidar_points'\]\['num_pts_feats'\]: The feature dimension of each point.
+  - info\['lidar_sweeps'\]: A list containing the lidar sweep information.
+    - info\['lidar_sweeps'\]\[i\]\['lidar_points'\]\['lidar_path'\]: The lidar data path of the i-th sweep.
+    - info\['lidar_sweeps'\]\[i\]\['ego2global'\]: The transformation matrix from the ego vehicle to global coordinates. (4x4 list)
+    - info\['lidar_sweeps'\]\[i\]\['timestamp'\]: Timestamp of the sweep data.
+  - info\['images'\]: A dict containing five keys, one for each camera: `'CAM_FRONT'`, `'CAM_FRONT_RIGHT'`, `'CAM_FRONT_LEFT'`, `'CAM_SIDE_LEFT'`, `'CAM_SIDE_RIGHT'`. Each dict contains all the data information related to the corresponding camera.
+    - info\['images'\]\['CAM_XXX'\]\['img_path'\]: The filename of the image.
+    - info\['images'\]\['CAM_XXX'\]\['height'\]: The height of the image.
+    - info\['images'\]\['CAM_XXX'\]\['width'\]: The width of the image.
+    - info\['images'\]\['CAM_XXX'\]\['cam2img'\]: The transformation matrix recording the intrinsic parameters when projecting 3D points onto the image plane. (4x4 list)
+    - info\['images'\]\['CAM_XXX'\]\['lidar2cam'\]: The transformation matrix from the lidar sensor to this camera. (4x4 list)
+    - info\['images'\]\['CAM_XXX'\]\['lidar2img'\]: The transformation matrix from the lidar sensor to the image plane. (4x4 list)
+  - info\['image_sweeps'\]: A list containing the image sweep information.
+    - info\['image_sweeps'\]\[i\]\['images'\]\['CAM_XXX'\]\['img_path'\]: The image path of the i-th sweep.
+    - info\['image_sweeps'\]\[i\]\['ego2global'\]: The transformation matrix from the ego vehicle to global coordinates. (4x4 list)
+    - info\['image_sweeps'\]\[i\]\['timestamp'\]: Timestamp of the sweep data.
+  - info\['instances'\]: It is a list of dicts. Each dict contains all the annotation information of a single instance. For the i-th instance:
+    - info\['instances'\]\[i\]\['bbox_3d'\]: List of 7 numbers representing the 3D bounding box of the instance, in (x, y, z, l, w, h, yaw) order.
+    - info\['instances'\]\[i\]\['bbox'\]: List of 4 numbers representing the 2D bounding box of the instance, in (x1, y1, x2, y2) order. (Some instances may not have a corresponding 2D bounding box.)
+    - info\['instances'\]\[i\]\['bbox_label_3d'\]: An int indicating the label of the instance; -1 indicates ignore.
+    - info\['instances'\]\[i\]\['bbox_label'\]: An int indicating the label of the instance; -1 indicates ignore.
+    - info\['instances'\]\[i\]\['num_lidar_pts'\]: The number of lidar points included in the 3D bounding box.
+    - info\['instances'\]\[i\]\['camera_id'\]: The index of the most visible camera for this instance.
+    - info\['instances'\]\[i\]\['group_id'\]: The index of this instance in this sample.
+  - info\['cam_sync_instances'\]: It is a list of dicts. Each dict contains all the annotation information of a single instance. Its format is the same as that of \['instances'\]. However, \['cam_sync_instances'\] is only used for the multi-view camera-based 3D Object Detection task.
+  - info\['cam_instances'\]: It is a dict containing the keys `'CAM_FRONT'`, `'CAM_FRONT_RIGHT'`, `'CAM_FRONT_LEFT'`, `'CAM_SIDE_LEFT'`, `'CAM_SIDE_RIGHT'`. For the monocular camera-based 3D Object Detection task, we split the 3D annotations of the whole scene according to the camera they belong to. For the i-th instance:
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['bbox_3d'\]: List of 7 numbers representing the 3D bounding box of the instance, in (x, y, z, l, h, w, yaw) order.
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['bbox'\]: 2D bounding box annotation (exterior rectangle of the projected 3D box), a list arranged as \[x1, y1, x2, y2\].
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['bbox_label_3d'\]: Label of the instance.
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['bbox_label'\]: Label of the instance.
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['center_2d'\]: Projected center location on the image, a list of shape (2,).
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['depth'\]: The depth of the projected center.
 
 ## Training
 
@@ -101,7 +135,7 @@ Considering there are many similar frames in the original dataset, we can basica
 
 ## Evaluation
 
-For evaluation on Waymo, please follow the [instruction](https://github.com/waymo-research/waymo-open-dataset/blob/master/docs/quick_start.md) to build the binary file `compute_detection_metrics_main` for metrics computation and put it into `mmdet3d/core/evaluation/waymo_utils/`. Basically, you can follow the commands below to install `bazel` and build the file.
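As a quick sanity check of the `info` structure listed above, the generated pkl file can be inspected directly with `mmengine`. The snippet below is an editorial sketch, not part of the patch; it assumes the info file was produced at the default path used by the `tools/create_data.py` command shown earlier.

```python
from mmengine.fileio import load

# Path produced by the create_data.py command above; adjust it if you used
# a different --out-dir.
infos = load('./data/waymo/kitti_format/waymo_infos_train.pkl')
print(infos['metainfo'])                 # dataset / version / info_version

info = infos['data_list'][0]             # first sample ("info" dict)
print(info['sample_idx'], info['context_name'], info['timestamp'])
print(info['lidar_points']['lidar_path'])
print(list(info['images'].keys()))       # CAM_FRONT, CAM_FRONT_LEFT, ...
print(len(info['instances']))            # number of annotated instances
```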
+For evaluation on Waymo, please follow the [instruction](https://github.com/waymo-research/waymo-open-dataset/blob/r1.3/docs/quick_start.md) to build the binary file `compute_detection_metrics_main` for metrics computation and put it into `mmdet3d/core/evaluation/waymo_utils/`. Basically, you can follow the commands below to install `bazel` and build the file. ```shell # download the code and enter the base directory diff --git a/docs/zh_cn/advanced_guides/datasets/waymo.md b/docs/zh_cn/advanced_guides/datasets/waymo.md index 577ec1513a..8c0f0dfc0f 100644 --- a/docs/zh_cn/advanced_guides/datasets/waymo.md +++ b/docs/zh_cn/advanced_guides/datasets/waymo.md @@ -7,12 +7,7 @@ 在准备 Waymo 数据集之前,如果您之前只安装了 `requirements/build.txt` 和 `requirements/runtime.txt` 中的依赖,请通过运行如下指令额外安装 Waymo 数据集所依赖的官方包: ``` -# tf 2.1.0. -pip install waymo-open-dataset-tf-2-1-0==1.2.0 -# tf 2.0.0 -# pip install waymo-open-dataset-tf-2-0-0==1.2.0 -# tf 1.15.0 -# pip install waymo-open-dataset-tf-1-15-0==1.2.0 +pip install waymo-open-dataset-tf-2-6-0 ``` 或者 @@ -38,6 +33,8 @@ mmdetection3d │ │ │ ├── validation │ │ │ ├── testing │ │ │ ├── gt.bin +│ │ │ ├── cam_gt.bin +│ │ │ ├── fov_gt.bin │ │ ├── kitti_format │ │ │ ├── ImageSets @@ -46,7 +43,9 @@ mmdetection3d 您可以在[这里](https://waymo.com/open/download/)下载 1.2 版本的 Waymo 公开数据集,并在[这里](https://drive.google.com/drive/folders/18BVuF_RYJF0NjZpt8SnfzANiakoRMf0o?usp=sharing)下载其训练/验证/测试集拆分文件。接下来,请将 `tfrecord` 文件放入 `data/waymo/waymo_format/` 下的对应文件夹,并将 txt 格式的数据集拆分文件放入 `data/waymo/kitti_format/ImageSets`。在[这里](https://console.cloud.google.com/storage/browser/waymo_open_dataset_v_1_2_0/validation/ground_truth_objects)下载验证集使用的 bin 格式真实标注 (Ground Truth) 文件并放入 `data/waymo/waymo_format/`。小窍门:您可以使用 `gsutil` 来在命令行下载大规模数据集。您可以将该[工具](https://github.com/RalphMao/Waymo-Dataset-Tool) 作为一个例子来查看更多细节。之后,通过运行如下指令准备 Waymo 数据: ```bash -python tools/create_data.py waymo --root-path ./data/waymo/ --out-dir ./data/waymo/ --workers 128 --extra-tag waymo +# TF_CPP_MIN_LOG_LEVEL=3 will disable all logging output from TensorFlow. +# The number of `--workers` depends on the maximum number of cores in your CPU. 
+TF_CPP_MIN_LOG_LEVEL=3 python tools/create_data.py waymo --root-path ./data/waymo --out-dir ./data/waymo --workers 128 --extra-tag waymo --version v1.4 ``` 请注意,如果您的本地磁盘没有足够空间保存转换后的数据,您可以将 `--out-dir` 改为其他目录;只要在创建文件夹、准备数据并转换格式后,将数据文件链接到 `data/waymo/kitti_format` 即可。 @@ -65,22 +64,16 @@ mmdetection3d │ │ │ ├── validation │ │ │ ├── testing │ │ │ ├── gt.bin +│ │ │ ├── cam_gt.bin +│ │ │ ├── fov_gt.bin │ │ ├── kitti_format │ │ │ ├── ImageSets │ │ │ ├── training -│ │ │ │ ├── calib │ │ │ │ ├── image_0 │ │ │ │ ├── image_1 │ │ │ │ ├── image_2 │ │ │ │ ├── image_3 │ │ │ │ ├── image_4 -│ │ │ │ ├── label_0 -│ │ │ │ ├── label_1 -│ │ │ │ ├── label_2 -│ │ │ │ ├── label_3 -│ │ │ │ ├── label_4 -│ │ │ │ ├── label_all -│ │ │ │ ├── pose │ │ │ │ ├── velodyne │ │ │ ├── testing │ │ │ │ ├── (the same as training) @@ -93,7 +86,48 @@ mmdetection3d ``` -因为 Waymo 数据的来源包含数个相机,这里我们将每个相机对应的图像和标签文件分别存储,并将相机位姿 (pose) 文件存储下来以供后续处理连续多帧的点云。我们使用 `{a}{bbb}{ccc}` 的名称编码方式为每帧数据命名,其中 `a` 是不同数据拆分的前缀(`0` 指代训练集,`1` 指代验证集,`2` 指代测试集),`bbb` 是分割部分 (segment) 的索引,而 `ccc` 是帧索引。您可以轻而易举地按照如上命名规则定位到所需的帧。我们将训练和验证所需数据按 KITTI 的方式集合在一起,然后将训练集/验证集/测试集的索引存储在 `ImageSet` 下的文件中。 +- `kitti_format/training/image_{0-4}/{a}{bbb}{ccc}.jpg` 因为 Waymo 数据的来源包含数个相机,这里我们将每个相机对应的图像和标签文件分别存储,并将相机位姿 (pose) 文件存储下来以供后续处理连续多帧的点云。我们使用 `{a}{bbb}{ccc}` 的名称编码方式为每帧数据命名,其中 `a` 是不同数据拆分的前缀(`0` 指代训练集,`1` 指代验证集,`2` 指代测试集),`bbb` 是分割部分 (segment) 的索引,而 `ccc` 是帧索引。您可以轻而易举地按照如上命名规则定位到所需的帧。我们将训练和验证所需数据按 KITTI 的方式集合在一起,然后将训练集/验证集/测试集的索引存储在 `ImageSet` 下的文件中。 +- `kitti_format/training/velodyne/{a}{bbb}{ccc}.bin` 当前样本的点云数据 +- `kitti_format/waymo_gt_database/xxx_{Car/Pedestrian/Cyclist}_x.bin`. 训练数据集的每个 3D 包围框中包含的点云数据。这些点云会在数据增强中被使用,例如. `ObjectSample`. `xxx` 表示训练样本的索引,`x` 表示实例在当前样本中的索引。 +- `kitti_format/waymo_infos_train.pkl`. 训练数据集,该字典包含了两个键值:`metainfo` 和 `data_list`。`metainfo` 包含数据集的基本信息,例如 `dataset`、`version` 和 `info_version`。`data_list` 是由字典组成的列表,每个字典(以下简称 `info`)包含了单个样本的所有详细信息。: + - info\['sample_idx'\]: 样本在整个数据集的索引。 + - info\['ego2global'\]: 自车到全局坐标的变换矩阵。(4x4 列表) + - info\['timestamp'\]:样本数据时间戳。 + - info\['context_name'\]: 语境名,表示样本从哪个 `*.tfrecord` 片段中提取的。 + - info\['lidar_points'\]: 是一个字典,包含了所有与激光雷达点相关的信息。 + - info\['lidar_points'\]\['lidar_path'\]: 激光雷达点云数据的文件名。 + - info\['lidar_points'\]\['num_pts_feats'\]: 点的特征维度。 + - info\['lidar_sweeps'\]: 是一个列表,包含了历史帧信息。 + - info\['lidar_sweeps'\]\[i\]\['lidar_points'\]\['lidar_path'\]: 第 i 帧的激光雷达数据的文件路径。 + - info\['lidar_sweeps'\]\[i\]\['ego2global'\]: 第 i 帧的激光雷达传感器到自车的变换矩阵。(4x4 列表) + - info\['lidar_sweeps'\]\[i\]\['timestamp'\]: 第 i 帧的样本数据时间戳。 + - info\['images'\]: 是一个字典,包含与每个相机对应的六个键值:`'CAM_FRONT'`, `'CAM_FRONT_RIGHT'`, `'CAM_FRONT_LEFT'`, `'CAM_SIDE_LEFT'`, `'CAM_SIDE_RIGHT'`。每个字典包含了对应相机的所有数据信息。 + - info\['images'\]\['CAM_XXX'\]\['img_path'\]: 图像的文件名。 + - info\['images'\]\['CAM_XXX'\]\['height'\]: 图像的高 + - info\['images'\]\['CAM_XXX'\]\['width'\]: 图像的宽 + - info\['images'\]\['CAM_XXX'\]\['cam2img'\]: 当 3D 点投影到图像平面时需要的内参信息相关的变换矩阵。(3x3 列表) + - info\['images'\]\['CAM_XXX'\]\['lidar2cam'\]: 激光雷达传感器到该相机的变换矩阵。(4x4 列表) + - info\['images'\]\['CAM_XXX'\]\['lidar2img'\]: 激光雷达传感器到图像平面的变换矩阵。(4x4 列表) + - info\['image_sweeps'\]: 是一个列表,包含了历史帧信息。 + - info\['image_sweeps'\]\[i\]\['images'\]\['CAM_XXX'\]\['img_path'\]: 第i帧的图像的文件名. 
+ - info\['image_sweeps'\]\[i\]\['ego2global'\]: 第 i 帧的自车到全局坐标的变换矩阵。(4x4 列表) + - info\['image_sweeps'\]\[i\]\['timestamp'\]: 第 i 帧的样本数据时间戳。 + - info\['instances'\]: 是一个字典组成的列表。每个字典包含单个实例的所有标注信息。对于其中的第 i 个实例,我们有: + - info\['instances'\]\[i\]\['bbox_3d'\]: 长度为 7 的列表,以 (x, y, z, l, w, h, yaw) 的顺序表示实例的 3D 边界框。 + - info\['instances'\]\[i\]\['bbox'\]: 2D 边界框标注(,顺序为 \[x1, y1, x2, y2\] 的列表。有些实例可能没有对应的 2D 边界框标注。 + - info\['instances'\]\[i\]\['bbox_label_3d'\]: 整数表示实例的标签,-1 代表忽略。 + - info\['instances'\]\[i\]\['bbox_label'\]: 整数表示实例的标签,-1 代表忽略。 + - info\['instances'\]\[i\]\['num_lidar_pts'\]: 每个 3D 边界框内包含的激光雷达点数。 + - info\['instances'\]\[i\]\['camera_id'\]: 当前实例最可见相机的索引。 + - info\['instances'\]\[i\]\['group_id'\]: 当前实例在当前样本中的索引。 + - info\['cam_sync_instances'\]: 是一个字典组成的列表。每个字典包含单个实例的所有标注信息。它的形式与 \['instances'\]相同. 但是, \['cam_sync_instances'\] 专门用于基于多视角相机的三维目标检测任务。 + - info\['cam_instances'\]: 是一个字典,包含以下键值: `'CAM_FRONT'`, `'CAM_FRONT_RIGHT'`, `'CAM_FRONT_LEFT'`, `'CAM_SIDE_LEFT'`, `'CAM_SIDE_RIGHT'`. 对于基于视觉的 3D 目标检测任务,我们将整个场景的 3D 标注划分至它们所属于的相应相机中。对于其中的第 i 个实例,我们有: + - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['bbox_3d'\]: 长度为 7 的列表,以 (x, y, z, l, h, w, yaw) 的顺序表示实例的 3D 边界框。 + - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['bbox'\]: 2D 边界框标注(3D 框投影的矩形框),顺序为 \[x1, y1, x2, y2\] 的列表。 + - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['bbox_label_3d'\]: 实例标签。 + - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['bbox_label'\]: 实例标签。 + - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['center_2d'\]: 3D 框投影到图像上的中心点,大小为 (2, ) 的列表。 + - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['depth'\]: 3D 框投影中心的深度。 ## 训练 diff --git a/mmdet3d/datasets/det3d_dataset.py b/mmdet3d/datasets/det3d_dataset.py index 11caae4729..c701a893fd 100644 --- a/mmdet3d/datasets/det3d_dataset.py +++ b/mmdet3d/datasets/det3d_dataset.py @@ -113,7 +113,7 @@ def __init__(self, ori_label = self.METAINFO['classes'].index(name) self.label_mapping[ori_label] = label_idx - self.num_ins_per_cat = {name: 0 for name in metainfo['classes']} + self.num_ins_per_cat = [0] * len(metainfo['classes']) else: self.label_mapping = { i: i @@ -121,10 +121,7 @@ def __init__(self, } self.label_mapping[-1] = -1 - self.num_ins_per_cat = { - name: 0 - for name in self.METAINFO['classes'] - } + self.num_ins_per_cat = [0] * len(self.METAINFO['classes']) super().__init__( ann_file=ann_file, @@ -146,9 +143,12 @@ def __init__(self, # show statistics of this dataset print_log('-' * 30, 'current') - print_log(f'The length of the dataset: {len(self)}', 'current') + print_log( + f'The length of {"test" if self.test_mode else "training"} dataset: {len(self)}', # noqa: E501 + 'current') content_show = [['category', 'number']] - for cat_name, num in self.num_ins_per_cat.items(): + for label, num in enumerate(self.num_ins_per_cat): + cat_name = self.metainfo['classes'][label] content_show.append([cat_name, num]) table = AsciiTable(content_show) print_log( @@ -256,8 +256,7 @@ def parse_ann_info(self, info: dict) -> Union[dict, None]: for label in ann_info['gt_labels_3d']: if label != -1: - cat_name = self.metainfo['classes'][label] - self.num_ins_per_cat[cat_name] += 1 + self.num_ins_per_cat[label] += 1 return ann_info diff --git a/mmdet3d/datasets/waymo_dataset.py b/mmdet3d/datasets/waymo_dataset.py index 5b3a83824e..cda27e42e5 100644 --- a/mmdet3d/datasets/waymo_dataset.py +++ b/mmdet3d/datasets/waymo_dataset.py @@ -3,9 +3,11 @@ from typing import Callable, List, Union import numpy as np +from mmengine import print_log +from mmengine.fileio import load from mmdet3d.registry import DATASETS -from 
mmdet3d.structures import CameraInstance3DBoxes +from mmdet3d.structures import CameraInstance3DBoxes, LiDARInstance3DBoxes from .det3d_dataset import Det3DDataset from .kitti_dataset import KittiDataset @@ -163,13 +165,10 @@ def parse_ann_info(self, info: dict) -> dict: centers_2d = np.zeros((0, 2), dtype=np.float32) depths = np.zeros((0), dtype=np.float32) - # in waymo, lidar2cam = R0_rect @ Tr_velo_to_cam - # convert gt_bboxes_3d to velodyne coordinates with `lidar2cam` - lidar2cam = np.array(info['images'][self.default_cam_key]['lidar2cam']) - gt_bboxes_3d = CameraInstance3DBoxes( - ann_info['gt_bboxes_3d']).convert_to(self.box_mode_3d, - np.linalg.inv(lidar2cam)) - ann_info['gt_bboxes_3d'] = gt_bboxes_3d + if self.load_type == 'frame_based': + gt_bboxes_3d = LiDARInstance3DBoxes(ann_info['gt_bboxes_3d']) + else: + gt_bboxes_3d = CameraInstance3DBoxes(ann_info['gt_bboxes_3d']) anns_results = dict( gt_bboxes_3d=gt_bboxes_3d, @@ -182,9 +181,58 @@ def parse_ann_info(self, info: dict) -> dict: return anns_results def load_data_list(self) -> List[dict]: - """Add the load interval.""" - data_list = super().load_data_list() - data_list = data_list[::self.load_interval] + """Add the load interval. + + Returns: + list[dict]: A list of annotation. + """ # noqa: E501 + # `self.ann_file` denotes the absolute annotation file path if + # `self.root=None` or relative path if `self.root=/path/to/data/`. + annotations = load(self.ann_file) + if not isinstance(annotations, dict): + raise TypeError(f'The annotations loaded from annotation file ' + f'should be a dict, but got {type(annotations)}!') + if 'data_list' not in annotations or 'metainfo' not in annotations: + raise ValueError('Annotation must have data_list and metainfo ' + 'keys') + metainfo = annotations['metainfo'] + raw_data_list = annotations['data_list'] + raw_data_list = raw_data_list[::self.load_interval] + if self.load_interval > 1: + print_log( + f'Sample size will be reduced to 1/{self.load_interval} of' + ' the original data sample', + logger='current') + + # Meta information load from annotation file will not influence the + # existed meta information load from `BaseDataset.METAINFO` and + # `metainfo` arguments defined in constructor. + for k, v in metainfo.items(): + self._metainfo.setdefault(k, v) + + # load and parse data_infos. + data_list = [] + for raw_data_info in raw_data_list: + # parse raw data information to target format + data_info = self.parse_data_info(raw_data_info) + if isinstance(data_info, dict): + # For image tasks, `data_info` should information if single + # image, such as dict(img_path='xxx', width=360, ...) 
+ data_list.append(data_info) + elif isinstance(data_info, list): + # For video tasks, `data_info` could contain image + # information of multiple frames, such as + # [dict(video_path='xxx', timestamps=...), + # dict(video_path='xxx', timestamps=...)] + for item in data_info: + if not isinstance(item, dict): + raise TypeError('data_info must be list of dict, but ' + f'got {type(item)}') + data_list.extend(data_info) + else: + raise TypeError('data_info should be a dict or list of dict, ' + f'but got {type(data_info)}') + return data_list def parse_data_info(self, info: dict) -> Union[dict, List[dict]]: @@ -203,44 +251,39 @@ def parse_data_info(self, info: dict) -> Union[dict, List[dict]]: info['images'][self.default_cam_key] info['images'] = new_image_info info['instances'] = info['cam_instances'][self.default_cam_key] - return super().parse_data_info(info) + return Det3DDataset.parse_data_info(self, info) else: # in the mono3d, the instances is from cam sync. + # Convert frame-based infos to multi-view image-based data_list = [] - if self.modality['use_lidar']: - info['lidar_points']['lidar_path'] = \ - osp.join( - self.data_prefix.get('pts', ''), - info['lidar_points']['lidar_path']) - - if self.modality['use_camera']: - for cam_key, img_info in info['images'].items(): - if 'img_path' in img_info: - cam_prefix = self.data_prefix.get(cam_key, '') - img_info['img_path'] = osp.join( - cam_prefix, img_info['img_path']) - for (cam_key, img_info) in info['images'].items(): camera_info = dict() + camera_info['sample_idx'] = info['sample_idx'] + camera_info['timestamp'] = info['timestamp'] + camera_info['context_name'] = info['context_name'] camera_info['images'] = dict() camera_info['images'][cam_key] = img_info - if 'cam_instances' in info \ - and cam_key in info['cam_instances']: - camera_info['instances'] = info['cam_instances'][cam_key] + if 'img_path' in img_info: + cam_prefix = self.data_prefix.get(cam_key, '') + camera_info['images'][cam_key]['img_path'] = osp.join( + cam_prefix, img_info['img_path']) + if 'lidar2cam' in img_info: + camera_info['lidar2cam'] = np.array(img_info['lidar2cam']) + if 'cam2img' in img_info: + camera_info['cam2img'] = np.array(img_info['cam2img']) + if 'lidar2img' in img_info: + camera_info['lidar2img'] = np.array(img_info['lidar2img']) else: - camera_info['instances'] = [] - camera_info['ego2global'] = info['ego2global'] - if 'image_sweeps' in info: - camera_info['image_sweeps'] = info['image_sweeps'] - - # TODO check if need to modify the sample id - # TODO check when will use it except for evaluation. 
- camera_info['sample_idx'] = info['sample_idx'] + camera_info['lidar2img'] = camera_info[ + 'cam2img'] @ camera_info['lidar2cam'] if not self.test_mode: # used in training + camera_info['instances'] = info['cam_instances'][cam_key] camera_info['ann_info'] = self.parse_ann_info(camera_info) if self.test_mode and self.load_eval_anns: - info['eval_ann_info'] = self.parse_ann_info(info) + camera_info['instances'] = info['cam_instances'][cam_key] + camera_info['eval_ann_info'] = self.parse_ann_info( + camera_info) data_list.append(camera_info) return data_list diff --git a/mmdet3d/engine/hooks/visualization_hook.py b/mmdet3d/engine/hooks/visualization_hook.py index ffec1addc3..9de46d9692 100644 --- a/mmdet3d/engine/hooks/visualization_hook.py +++ b/mmdet3d/engine/hooks/visualization_hook.py @@ -78,11 +78,11 @@ def __init__(self, 'needs to be excluded.') self.vis_task = vis_task - if wait_time == -1: + if show and wait_time == -1: print_log( 'Manual control mode, press [Right] to next sample.', logger='current') - else: + elif show: print_log( 'Autoplay mode, press [SPACE] to pause.', logger='current') self.wait_time = wait_time diff --git a/mmdet3d/evaluation/functional/waymo_utils/prediction_to_waymo.py b/mmdet3d/evaluation/functional/waymo_utils/prediction_to_waymo.py index b9da8043d2..3c79d6f6cb 100644 --- a/mmdet3d/evaluation/functional/waymo_utils/prediction_to_waymo.py +++ b/mmdet3d/evaluation/functional/waymo_utils/prediction_to_waymo.py @@ -4,7 +4,6 @@ """ try: - from waymo_open_dataset import dataset_pb2 as open_dataset from waymo_open_dataset import label_pb2 from waymo_open_dataset.protos import metrics_pb2 from waymo_open_dataset.protos.metrics_pb2 import Objects @@ -14,13 +13,10 @@ 'Please run "pip install waymo-open-dataset-tf-2-1-0==1.2.0" ' 'to install the official devkit first.') -from glob import glob -from os.path import join -from typing import List, Optional +from typing import List import mmengine -import numpy as np -import tensorflow as tf +from mmengine import print_log class Prediction2Waymo(object): @@ -32,54 +28,22 @@ class Prediction2Waymo(object): Args: results (list[dict]): Prediction results. - waymo_tfrecords_dir (str): Directory to load waymo raw data. waymo_results_save_dir (str): Directory to save converted predictions in waymo format (.bin files). waymo_results_final_path (str): Path to save combined predictions in waymo format (.bin file), like 'a/b/c.bin'. - prefix (str): Prefix of filename. In general, 0 for training, 1 for - validation and 2 for testing. - classes (dict): A list of class name. - workers (str): Number of parallel processes. Defaults to 2. - backend_args (dict, optional): Arguments to instantiate the - corresponding backend. Defaults to None. - from_kitti_format (bool, optional): Whether the reuslts are kitti - format. Defaults to False. - idx2metainfo (Optional[dict], optional): The mapping from sample_idx to - metainfo. The metainfo must contain the keys: 'idx2contextname' and - 'idx2timestamp'. Defaults to None. + num_workers (str): Number of parallel processes. Defaults to 4. 
""" def __init__(self, results: List[dict], - waymo_tfrecords_dir: str, - waymo_results_save_dir: str, waymo_results_final_path: str, - prefix: str, classes: dict, - workers: int = 2, - backend_args: Optional[dict] = None, - from_kitti_format: bool = False, - idx2metainfo: Optional[dict] = None): - + num_workers: int = 4): self.results = results - self.waymo_tfrecords_dir = waymo_tfrecords_dir - self.waymo_results_save_dir = waymo_results_save_dir self.waymo_results_final_path = waymo_results_final_path - self.prefix = prefix self.classes = classes - self.workers = int(workers) - self.backend_args = backend_args - self.from_kitti_format = from_kitti_format - if idx2metainfo is not None: - self.idx2metainfo = idx2metainfo - # If ``fast_eval``, the metainfo does not need to be read from - # original data online. It's preprocessed offline. - self.fast_eval = True - else: - self.fast_eval = False - - self.name2idx = {} + self.num_workers = num_workers self.k2w_cls_map = { 'Car': label_pb2.Label.TYPE_VEHICLE, @@ -88,213 +52,23 @@ def __init__(self, 'Cyclist': label_pb2.Label.TYPE_CYCLIST, } - if self.from_kitti_format: - self.T_ref_to_front_cam = np.array([[0.0, 0.0, 1.0, 0.0], - [-1.0, 0.0, 0.0, 0.0], - [0.0, -1.0, 0.0, 0.0], - [0.0, 0.0, 0.0, 1.0]]) - # ``sample_idx`` of the sample in kitti-format is an array - for idx, result in enumerate(results): - if len(result['sample_idx']) > 0: - self.name2idx[str(result['sample_idx'][0])] = idx - else: - # ``sample_idx`` of the sample in the original prediction - # is an int value. - for idx, result in enumerate(results): - self.name2idx[str(result['sample_idx'])] = idx - - if not self.fast_eval: - # need to read original '.tfrecord' file - self.get_file_names() - # turn on eager execution for older tensorflow versions - if int(tf.__version__.split('.')[0]) < 2: - tf.enable_eager_execution() - - self.create_folder() - - def get_file_names(self): - """Get file names of waymo raw data.""" - if 'path_mapping' in self.backend_args: - for path in self.backend_args['path_mapping'].keys(): - if path in self.waymo_tfrecords_dir: - self.waymo_tfrecords_dir = \ - self.waymo_tfrecords_dir.replace( - path, self.backend_args['path_mapping'][path]) - from petrel_client.client import Client - client = Client() - contents = client.list(self.waymo_tfrecords_dir) - self.waymo_tfrecord_pathnames = list() - for content in sorted(list(contents)): - if content.endswith('tfrecord'): - self.waymo_tfrecord_pathnames.append( - join(self.waymo_tfrecords_dir, content)) - else: - self.waymo_tfrecord_pathnames = sorted( - glob(join(self.waymo_tfrecords_dir, '*.tfrecord'))) - print(len(self.waymo_tfrecord_pathnames), 'tfrecords found.') - - def create_folder(self): - """Create folder for data conversion.""" - mmengine.mkdir_or_exist(self.waymo_results_save_dir) - - def parse_objects(self, kitti_result, T_k2w, context_name, - frame_timestamp_micros): - """Parse one prediction with several instances in kitti format and - convert them to `Object` proto. - - Args: - kitti_result (dict): Predictions in kitti format. - - - name (np.ndarray): Class labels of predictions. - - dimensions (np.ndarray): Height, width, length of boxes. - - location (np.ndarray): Bottom center of boxes (x, y, z). - - rotation_y (np.ndarray): Orientation of boxes. - - score (np.ndarray): Scores of predictions. - T_k2w (np.ndarray): Transformation matrix from kitti to waymo. - context_name (str): Context name of the frame. - frame_timestamp_micros (int): Frame timestamp. 
- - Returns: - :obj:`Object`: Predictions in waymo dataset Object proto. - """ - - def parse_one_object(instance_idx): - """Parse one instance in kitti format and convert them to `Object` - proto. - - Args: - instance_idx (int): Index of the instance to be converted. - - Returns: - :obj:`Object`: Predicted instance in waymo dataset - Object proto. - """ - cls = kitti_result['name'][instance_idx] - length = round(kitti_result['dimensions'][instance_idx, 0], 4) - height = round(kitti_result['dimensions'][instance_idx, 1], 4) - width = round(kitti_result['dimensions'][instance_idx, 2], 4) - x = round(kitti_result['location'][instance_idx, 0], 4) - y = round(kitti_result['location'][instance_idx, 1], 4) - z = round(kitti_result['location'][instance_idx, 2], 4) - rotation_y = round(kitti_result['rotation_y'][instance_idx], 4) - score = round(kitti_result['score'][instance_idx], 4) - - # y: downwards; move box origin from bottom center (kitti) to - # true center (waymo) - y -= height / 2 - # frame transformation: kitti -> waymo - x, y, z = self.transform(T_k2w, x, y, z) - - # different conventions - heading = -(rotation_y + np.pi / 2) - while heading < -np.pi: - heading += 2 * np.pi - while heading > np.pi: - heading -= 2 * np.pi - - box = label_pb2.Label.Box() - box.center_x = x - box.center_y = y - box.center_z = z - box.length = length - box.width = width - box.height = height - box.heading = heading - - o = metrics_pb2.Object() - o.object.box.CopyFrom(box) - o.object.type = self.k2w_cls_map[cls] - o.score = score - - o.context_name = context_name - o.frame_timestamp_micros = frame_timestamp_micros - - return o - - objects = metrics_pb2.Objects() - - for instance_idx in range(len(kitti_result['name'])): - o = parse_one_object(instance_idx) - objects.objects.append(o) - - return objects - - def convert_one(self, file_idx): - """Convert action for single file. - - Args: - file_idx (int): Index of the file to be converted. - """ - file_pathname = self.waymo_tfrecord_pathnames[file_idx] - if 's3://' in file_pathname and tf.__version__ >= '2.6.0': - try: - import tensorflow_io as tfio # noqa: F401 - except ImportError: - raise ImportError( - "Please run 'pip install tensorflow-io' to install tensorflow_io first." 
# noqa: E501 - ) - file_data = tf.data.TFRecordDataset(file_pathname, compression_type='') - - for frame_num, frame_data in enumerate(file_data): - frame = open_dataset.Frame() - frame.ParseFromString(bytearray(frame_data.numpy())) - - filename = f'{self.prefix}{file_idx:03d}{frame_num:03d}' - - context_name = frame.context.name - frame_timestamp_micros = frame.timestamp_micros - - if filename in self.name2idx: - if self.from_kitti_format: - for camera in frame.context.camera_calibrations: - # FRONT = 1, see dataset.proto for details - if camera.name == 1: - T_front_cam_to_vehicle = np.array( - camera.extrinsic.transform).reshape(4, 4) - - T_k2w = T_front_cam_to_vehicle @ self.T_ref_to_front_cam - - kitti_result = \ - self.results[self.name2idx[filename]] - objects = self.parse_objects(kitti_result, T_k2w, - context_name, - frame_timestamp_micros) - else: - index = self.name2idx[filename] - objects = self.parse_objects_from_origin( - self.results[index], context_name, - frame_timestamp_micros) - - else: - print(filename, 'not found.') - objects = metrics_pb2.Objects() - - with open( - join(self.waymo_results_save_dir, f'{filename}.bin'), - 'wb') as f: - f.write(objects.SerializeToString()) - - def convert_one_fast(self, res_index: int): + def convert_one(self, res_idx: int): """Convert action for single file. It read the metainfo from the preprocessed file offline and will be faster. Args: - res_index (int): The indices of the results. + res_idx (int): The indices of the results. """ - sample_idx = self.results[res_index]['sample_idx'] - if len(self.results[res_index]['pred_instances_3d']) > 0: + sample_idx = self.results[res_idx]['sample_idx'] + if len(self.results[res_idx]['labels_3d']) > 0: objects = self.parse_objects_from_origin( - self.results[res_index], - self.idx2metainfo[str(sample_idx)]['contextname'], - self.idx2metainfo[str(sample_idx)]['timestamp']) + self.results[res_idx], self.results[res_idx]['context_name'], + self.results[res_idx]['timestamp']) else: print(sample_idx, 'not found.') objects = metrics_pb2.Objects() - with open( - join(self.waymo_results_save_dir, f'{sample_idx}.bin'), - 'wb') as f: - f.write(objects.SerializeToString()) + return objects def parse_objects_from_origin(self, result: dict, contextname: str, timestamp: str) -> Objects: @@ -308,112 +82,56 @@ def parse_objects_from_origin(self, result: dict, contextname: str, Returns: metrics_pb2.Objects: The parsed object. 
""" - lidar_boxes = result['pred_instances_3d']['bboxes_3d'].tensor - scores = result['pred_instances_3d']['scores_3d'] - labels = result['pred_instances_3d']['labels_3d'] - - def parse_one_object(index): - class_name = self.classes[labels[index].item()] + lidar_boxes = result['bboxes_3d'] + scores = result['scores_3d'] + labels = result['labels_3d'] + objects = metrics_pb2.Objects() + for lidar_box, score, label in zip(lidar_boxes, scores, labels): + # Parse one object box = label_pb2.Label.Box() - height = lidar_boxes[index][5].item() - heading = lidar_boxes[index][6].item() - - while heading < -np.pi: - heading += 2 * np.pi - while heading > np.pi: - heading -= 2 * np.pi - - box.center_x = lidar_boxes[index][0].item() - box.center_y = lidar_boxes[index][1].item() - box.center_z = lidar_boxes[index][2].item() + height / 2 - box.length = lidar_boxes[index][3].item() - box.width = lidar_boxes[index][4].item() + height = lidar_box[5] + heading = lidar_box[6] + + box.center_x = lidar_box[0] + box.center_y = lidar_box[1] + box.center_z = lidar_box[2] + height / 2 + box.length = lidar_box[3] + box.width = lidar_box[4] box.height = height box.heading = heading - o = metrics_pb2.Object() - o.object.box.CopyFrom(box) - o.object.type = self.k2w_cls_map[class_name] - o.score = scores[index].item() - o.context_name = contextname - o.frame_timestamp_micros = timestamp + object = metrics_pb2.Object() + object.object.box.CopyFrom(box) - return o - - objects = metrics_pb2.Objects() - for i in range(len(lidar_boxes)): - objects.objects.append(parse_one_object(i)) + class_name = self.classes[label] + object.object.type = self.k2w_cls_map[class_name] + object.score = score + object.context_name = contextname + object.frame_timestamp_micros = timestamp + objects.objects.append(object) return objects def convert(self): """Convert action.""" - print('Start converting ...') - convert_func = self.convert_one_fast if self.fast_eval else \ - self.convert_one + print_log('Start converting ...', logger='current') - # from torch.multiprocessing import set_sharing_strategy - # # Force using "file_system" sharing strategy for stability - # set_sharing_strategy("file_system") + # TODO: use parallel processes. + # objects_list = mmengine.track_parallel_progress( + # self.convert_one, range(len(self)), self.num_workers) - # mmengine.track_parallel_progress(convert_func, range(len(self)), - # self.workers) + objects_list = mmengine.track_progress(self.convert_one, + range(len(self))) - # TODO: Support multiprocessing. Now, multiprocessing evaluation will - # cause shared memory error in torch-1.10 and torch-1.11. Details can - # be seen in https://github.com/pytorch/pytorch/issues/67864. - prog_bar = mmengine.ProgressBar(len(self)) - for i in range(len(self)): - convert_func(i) - prog_bar.update() - - print('\nFinished ...') - - # combine all files into one .bin - pathnames = sorted(glob(join(self.waymo_results_save_dir, '*.bin'))) - combined = self.combine(pathnames) + combined = metrics_pb2.Objects() + for objects in objects_list: + for o in objects.objects: + combined.objects.append(o) with open(self.waymo_results_final_path, 'wb') as f: f.write(combined.SerializeToString()) def __len__(self): """Length of the filename list.""" - return len(self.results) if self.fast_eval else len( - self.waymo_tfrecord_pathnames) - - def transform(self, T, x, y, z): - """Transform the coordinates with matrix T. - - Args: - T (np.ndarray): Transformation matrix. - x(float): Coordinate in x axis. 
- y(float): Coordinate in y axis. - z(float): Coordinate in z axis. - - Returns: - list: Coordinates after transformation. - """ - pt_bef = np.array([x, y, z, 1.0]).reshape(4, 1) - pt_aft = np.matmul(T, pt_bef) - return pt_aft[:3].flatten().tolist() - - def combine(self, pathnames): - """Combine predictions in waymo format for each sample together. - - Args: - pathnames (str): Paths to save predictions. - - Returns: - :obj:`Objects`: Combined predictions in Objects proto. - """ - combined = metrics_pb2.Objects() - - for pathname in pathnames: - objects = metrics_pb2.Objects() - with open(pathname, 'rb') as f: - objects.ParseFromString(f.read()) - for o in objects.objects: - combined.objects.append(o) - - return combined + return len(self.results) diff --git a/mmdet3d/evaluation/metrics/waymo_metric.py b/mmdet3d/evaluation/metrics/waymo_metric.py index 0dd69a5c24..cdbc4a58db 100644 --- a/mmdet3d/evaluation/metrics/waymo_metric.py +++ b/mmdet3d/evaluation/metrics/waymo_metric.py @@ -1,54 +1,30 @@ # Copyright (c) OpenMMLab. All rights reserved. import tempfile from os import path as osp -from typing import Dict, List, Optional, Tuple, Union +from typing import Dict, List, Optional, Sequence, Tuple, Union -import mmengine import numpy as np import torch -from mmengine import Config, load +from mmengine import Config +from mmengine.device import get_device +from mmengine.evaluator import BaseMetric from mmengine.logging import MMLogger, print_log from mmdet3d.models.layers import box3d_multiclass_nms from mmdet3d.registry import METRICS from mmdet3d.structures import (Box3DMode, CameraInstance3DBoxes, - LiDARInstance3DBoxes, bbox3d2result, - points_cam2img, xywhr2xyxyr) -from .kitti_metric import KittiMetric + LiDARInstance3DBoxes, points_cam2img, + xywhr2xyxyr) @METRICS.register_module() -class WaymoMetric(KittiMetric): +class WaymoMetric(BaseMetric): """Waymo evaluation metric. Args: - ann_file (str): The path of the annotation file in kitti format. waymo_bin_file (str): The path of the annotation file in waymo format. - data_root (str): Path of dataset root. Used for storing waymo - evaluation programs. - split (str): The split of the evaluation set. Defaults to 'training'. metric (str or List[str]): Metrics to be evaluated. Defaults to 'mAP'. - pcd_limit_range (List[float]): The range of point cloud used to filter - invalid predicted boxes. Defaults to [-85, -85, -5, 85, 85, 5]. - convert_kitti_format (bool): Whether to convert the results to kitti - format. Now, in order to be compatible with camera-based methods, - defaults to True. - prefix (str, optional): The prefix that will be added in the metric - names to disambiguate homonymous metrics of different evaluators. - If prefix is not provided in the argument, self.default_prefix will - be used instead. Defaults to None. - format_only (bool): Format the output results without perform - evaluation. It is useful when you want to format the result to a - specific format and submit it to the test server. - Defaults to False. - pklfile_prefix (str, optional): The prefix of pkl files, including the - file path and the prefix of filename, e.g., "a/b/prefix". If not - specified, a temp file will be created. Defaults to None. - submission_prefix (str, optional): The prefix of submission data. If - not specified, the submission data will not be generated. - Defaults to None. load_type (str): Type of loading mode during training. - - 'frame_based': Load all of the instances in the frame. 
- 'mv_image_based': Load all of the instances in the frame and need to convert to the FOV-based data type to support image-based @@ -56,73 +32,98 @@ class WaymoMetric(KittiMetric): - 'fov_image_based': Only load the instances inside the default cam and need to convert to the FOV-based data type to support image- based detector. - default_cam_key (str): The default camera for lidar to camera - conversion. By default, KITTI: 'CAM2', Waymo: 'CAM_FRONT'. - Defaults to 'CAM_FRONT'. - use_pred_sample_idx (bool): In formating results, use the sample index - from the prediction or from the load annotations. By default, - KITTI: True, Waymo: False, Waymo has a conversion process, which - needs to use the sample idx from load annotation. - Defaults to False. - collect_device (str): Device name used for collecting results from - different ranks during distributed training. Must be 'cpu' or - 'gpu'. Defaults to 'cpu'. - backend_args (dict, optional): Arguments to instantiate the - corresponding backend. Defaults to None. - idx2metainfo (str, optional): The file path of the metainfo in waymo. - It stores the mapping from sample_idx to metainfo. The metainfo - must contain the keys: 'idx2contextname' and 'idx2timestamp'. + result_prefix (str, optional): The prefix of result '*.bin' file, + including the file path and the prefix of filename, e.g., + "a/b/prefix". If not specified, a temp file will be created. Defaults to None. + format_only (bool): Format the output results without perform + evaluation. It is useful when you want to format the result to a + specific format and submit it to the test server. + Defaults to False. + nms_cfg (dict): The configuration of non-maximum suppression for + the mergence of multi-image predicted bboxes, only use when + load_type == 'mv_image_based'. Defaults to None. 
""" num_cams = 5 + default_prefix = 'Waymo metric' def __init__(self, - ann_file: str, waymo_bin_file: str, - data_root: str, - split: str = 'training', metric: Union[str, List[str]] = 'mAP', - pcd_limit_range: List[float] = [-85, -85, -5, 85, 85, 5], - convert_kitti_format: bool = True, - prefix: Optional[str] = None, - format_only: bool = False, - pklfile_prefix: Optional[str] = None, - submission_prefix: Optional[str] = None, load_type: str = 'frame_based', - default_cam_key: str = 'CAM_FRONT', - use_pred_sample_idx: bool = False, - collect_device: str = 'cpu', - backend_args: Optional[dict] = None, - idx2metainfo: Optional[str] = None) -> None: + result_prefix: Optional[str] = None, + format_only: bool = False, + nms_cfg=None, + **kwargs) -> None: + super().__init__(**kwargs) self.waymo_bin_file = waymo_bin_file - self.data_root = data_root - self.split = split + self.metrics = metric if isinstance(metric, list) else [metric] self.load_type = load_type - self.use_pred_sample_idx = use_pred_sample_idx - self.convert_kitti_format = convert_kitti_format - - if idx2metainfo is not None: - self.idx2metainfo = mmengine.load(idx2metainfo) - else: - self.idx2metainfo = None - - super(WaymoMetric, self).__init__( - ann_file=ann_file, - metric=metric, - pcd_limit_range=pcd_limit_range, - prefix=prefix, - pklfile_prefix=pklfile_prefix, - submission_prefix=submission_prefix, - default_cam_key=default_cam_key, - collect_device=collect_device, - backend_args=backend_args) + self.result_prefix = result_prefix self.format_only = format_only if self.format_only: - assert pklfile_prefix is not None, 'pklfile_prefix must be not ' + assert result_prefix is not None, 'result_prefix must be not ' 'None when format_only is True, otherwise the result files will ' 'be saved to a temp directory which will be cleaned up at the end.' + if nms_cfg is not None: + assert load_type == 'mv_image_based', 'nms_cfg in WaymoMetric ' + 'only use when load_type == \'mv_image_based\'.' + self.nms_cfg = Config(nms_cfg) + + def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None: + """Process one batch of data samples and predictions. - self.default_prefix = 'Waymo metric' + The processed results should be stored in ``self.results``, which will + be used to compute the metrics when all batches have been processed. + + Args: + data_batch (dict): A batch of data from the dataloader. + data_samples (Sequence[dict]): A batch of outputs from the model. + """ + + for data_sample in data_samples: + result = dict() + bboxes_3d = data_sample['pred_instances_3d']['bboxes_3d'] + bboxes_3d.limit_yaw(offset=0.5, period=np.pi * 2) + scores_3d = data_sample['pred_instances_3d']['scores_3d'] + labels_3d = data_sample['pred_instances_3d']['labels_3d'] + # TODO: check lidar post-processing + if isinstance(bboxes_3d, CameraInstance3DBoxes): + box_corners = bboxes_3d.corners + cam2img = box_corners.new_tensor( + np.array(data_sample['cam2img'])) + box_corners_in_image = points_cam2img(box_corners, cam2img) + # box_corners_in_image: [N, 8, 2] + minxy = torch.min(box_corners_in_image, dim=1)[0] + maxxy = torch.max(box_corners_in_image, dim=1)[0] + # check minxy & maxxy + # if the projected 2d bbox has intersection + # with the image, we keep it, otherwise, we omit it. 
+ img_shape = data_sample['img_shape'] + valid_inds = ((minxy[:, 0] < img_shape[1]) & + (minxy[:, 1] < img_shape[0]) & (maxxy[:, 0] > 0) + & (maxxy[:, 1] > 0)) + + if valid_inds.sum() > 0: + lidar2cam = data_sample['lidar2cam'] + bboxes_3d = bboxes_3d.convert_to( + Box3DMode.LIDAR, + np.linalg.inv(lidar2cam), + correct_yaw=True) + bboxes_3d = bboxes_3d[valid_inds] + scores_3d = scores_3d[valid_inds] + labels_3d = labels_3d[valid_inds] + else: + bboxes_3d = torch.zeros([0, 7]) + scores_3d = torch.zeros([0]) + labels_3d = torch.zeros([0]) + result['bboxes_3d'] = bboxes_3d.tensor.cpu().numpy() + result['scores_3d'] = scores_3d.cpu().numpy() + result['labels_3d'] = labels_3d.cpu().numpy() + result['sample_idx'] = data_sample['sample_idx'] + result['context_name'] = data_sample['context_name'] + result['timestamp'] = data_sample['timestamp'] + self.results.append(result) def compute_metrics(self, results: List[dict]) -> Dict[str, float]: """Compute the metrics from processed results. @@ -137,80 +138,49 @@ def compute_metrics(self, results: List[dict]) -> Dict[str, float]: logger: MMLogger = MMLogger.get_current_instance() self.classes = self.dataset_meta['classes'] - # load annotations - self.data_infos = load(self.ann_file)['data_list'] - assert len(results) == len(self.data_infos), \ - 'invalid list length of network outputs' # different from kitti, waymo do not need to convert the ann file # handle the mv_image_based load_mode if self.load_type == 'mv_image_based': - new_data_infos = [] - for info in self.data_infos: - height = info['images'][self.default_cam_key]['height'] - width = info['images'][self.default_cam_key]['width'] - for (cam_key, img_info) in info['images'].items(): - camera_info = dict() - camera_info['images'] = dict() - camera_info['images'][cam_key] = img_info - # TODO remove the check by updating the data info; - if 'height' not in img_info: - img_info['height'] = height - img_info['width'] = width - if 'cam_instances' in info \ - and cam_key in info['cam_instances']: - camera_info['instances'] = info['cam_instances'][ - cam_key] - else: - camera_info['instances'] = [] - camera_info['ego2global'] = info['ego2global'] - if 'image_sweeps' in info: - camera_info['image_sweeps'] = info['image_sweeps'] - - # TODO check if need to modify the sample idx - # TODO check when will use it except for evaluation. - camera_info['sample_idx'] = info['sample_idx'] - new_data_infos.append(camera_info) - self.data_infos = new_data_infos - - if self.pklfile_prefix is None: + assert len(results) % 5 == 0, 'The multi-view image-based results' + ' must be 5 times as large as the original frame-based results.' 
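+            # Group every 5 consecutive single-camera results back into one
+            # frame; this assumes the test dataloader emits the five camera
+            # views of each frame consecutively and in order.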
+ frame_results = [ + results[i:i + 5] for i in range(0, len(results), 5) + ] + results = self.merge_multi_view_boxes(frame_results) + + if self.result_prefix is None: eval_tmp_dir = tempfile.TemporaryDirectory() - pklfile_prefix = osp.join(eval_tmp_dir.name, 'results') + result_prefix = osp.join(eval_tmp_dir.name, 'results') else: eval_tmp_dir = None - pklfile_prefix = self.pklfile_prefix + result_prefix = self.result_prefix - result_dict, tmp_dir = self.format_results( - results, - pklfile_prefix=pklfile_prefix, - submission_prefix=self.submission_prefix, - classes=self.classes) + self.format_results(results, result_prefix=result_prefix) metric_dict = {} if self.format_only: logger.info('results are saved in ' - f'{osp.dirname(self.pklfile_prefix)}') + f'{osp.dirname(self.result_prefix)}') return metric_dict for metric in self.metrics: ap_dict = self.waymo_evaluate( - pklfile_prefix, metric=metric, logger=logger) + result_prefix, metric=metric, logger=logger) metric_dict.update(ap_dict) if eval_tmp_dir is not None: eval_tmp_dir.cleanup() - if tmp_dir is not None: - tmp_dir.cleanup() return metric_dict def waymo_evaluate(self, - pklfile_prefix: str, + result_prefix: str, metric: Optional[str] = None, logger: Optional[MMLogger] = None) -> Dict[str, float]: """Evaluation in Waymo protocol. Args: - pklfile_prefix (str): The location that stored the prediction + result_prefix (str): The location that stored the prediction results. metric (str, optional): Metric to be evaluated. Defaults to None. logger (MMLogger, optional): Logger used for printing related @@ -224,7 +194,7 @@ def waymo_evaluate(self, if metric == 'mAP': eval_str = 'mmdet3d/evaluation/functional/waymo_utils/' + \ - f'compute_detection_metrics_main {pklfile_prefix}.bin ' + \ + f'compute_detection_metrics_main {result_prefix}.bin ' + \ f'{self.waymo_bin_file}' print(eval_str) ret_bytes = subprocess.check_output(eval_str, shell=True) @@ -275,7 +245,7 @@ def waymo_evaluate(self, ap_dict['Cyclist/L2 mAPH']) / 3 elif metric == 'LET_mAP': eval_str = 'mmdet3d/evaluation/functional/waymo_utils/' + \ - f'compute_detection_let_metrics_main {pklfile_prefix}.bin ' + \ + f'compute_detection_let_metrics_main {result_prefix}.bin ' + \ f'{self.waymo_bin_file}' print(eval_str) @@ -325,76 +295,26 @@ def waymo_evaluate(self, def format_results( self, results: List[dict], - pklfile_prefix: Optional[str] = None, - submission_prefix: Optional[str] = None, - classes: Optional[List[str]] = None + result_prefix: Optional[str] = None ) -> Tuple[dict, Union[tempfile.TemporaryDirectory, None]]: """Format the results to bin file. Args: results (List[dict]): Testing results of the dataset. - pklfile_prefix (str, optional): The prefix of pkl files. It + result_prefix (str, optional): The prefix of result file. It includes the file path and the prefix of filename, e.g., "a/b/prefix". If not specified, a temp file will be created. Defaults to None. - submission_prefix (str, optional): The prefix of submitted files. - It includes the file path and the prefix of filename, e.g., - "a/b/prefix". If not specified, a temp file will be created. - Defaults to None. - classes (List[str], optional): A list of class name. - Defaults to None. - - Returns: - tuple: (result_dict, tmp_dir), result_dict is a dict containing the - formatted result, tmp_dir is the temporal directory created for - saving json files when jsonfile_prefix is not specified. 
""" - waymo_save_tmp_dir = tempfile.TemporaryDirectory() - waymo_results_save_dir = waymo_save_tmp_dir.name - waymo_results_final_path = f'{pklfile_prefix}.bin' - - if self.convert_kitti_format: - results_kitti_format, tmp_dir = super().format_results( - results, pklfile_prefix, submission_prefix, classes) - final_results = results_kitti_format['pred_instances_3d'] - else: - final_results = results - for i, res in enumerate(final_results): - # Actually, `sample_idx` here is the filename without suffix. - # It's for identitying the sample in formating. - res['sample_idx'] = self.data_infos[i]['sample_idx'] - res['pred_instances_3d']['bboxes_3d'].limit_yaw( - offset=0.5, period=np.pi * 2) - - waymo_root = self.data_root - if self.split == 'training': - waymo_tfrecords_dir = osp.join(waymo_root, 'validation') - prefix = '1' - elif self.split == 'testing': - waymo_tfrecords_dir = osp.join(waymo_root, 'testing') - prefix = '2' - else: - raise ValueError('Not supported split value.') + waymo_results_final_path = f'{result_prefix}.bin' from ..functional.waymo_utils.prediction_to_waymo import \ Prediction2Waymo - converter = Prediction2Waymo( - final_results, - waymo_tfrecords_dir, - waymo_results_save_dir, - waymo_results_final_path, - prefix, - classes, - backend_args=self.backend_args, - from_kitti_format=self.convert_kitti_format, - idx2metainfo=self.idx2metainfo) + converter = Prediction2Waymo(results, waymo_results_final_path, + self.classes) converter.convert() - waymo_save_tmp_dir.cleanup() - - return final_results, waymo_save_tmp_dir - def merge_multi_view_boxes(self, box_dict_per_frame: List[dict], - cam0_info: dict) -> dict: + def merge_multi_view_boxes(self, frame_results: List[dict]) -> dict: """Merge bounding boxes predicted from multi-view images. Args: @@ -403,308 +323,43 @@ def merge_multi_view_boxes(self, box_dict_per_frame: List[dict], cam0_info (dict): Store the sample idx for the given frame. Returns: - dict: Merged results. 
- """ - box_dict = dict() - # convert list[dict] to dict[list] - for key in box_dict_per_frame[0].keys(): - box_dict[key] = list() - for cam_idx in range(self.num_cams): - box_dict[key].append(box_dict_per_frame[cam_idx][key]) - # merge each elements - box_dict['sample_idx'] = cam0_info['image_id'] - for key in ['bbox', 'box3d_lidar', 'scores', 'label_preds']: - box_dict[key] = np.concatenate(box_dict[key]) - - # apply nms to box3d_lidar (box3d_camera are in different systems) - # TODO: move this global setting into config - nms_cfg = dict( - use_rotate_nms=True, - nms_across_levels=False, - nms_pre=500, - nms_thr=0.05, - score_thr=0.001, - min_bbox_size=0, - max_per_frame=100) - nms_cfg = Config(nms_cfg) - lidar_boxes3d = LiDARInstance3DBoxes( - torch.from_numpy(box_dict['box3d_lidar']).cuda()) - scores = torch.from_numpy(box_dict['scores']).cuda() - labels = torch.from_numpy(box_dict['label_preds']).long().cuda() - nms_scores = scores.new_zeros(scores.shape[0], len(self.classes) + 1) - indices = labels.new_tensor(list(range(scores.shape[0]))) - nms_scores[indices, labels] = scores - lidar_boxes3d_for_nms = xywhr2xyxyr(lidar_boxes3d.bev) - boxes3d = lidar_boxes3d.tensor - # generate attr scores from attr labels - boxes3d, scores, labels = box3d_multiclass_nms( - boxes3d, lidar_boxes3d_for_nms, nms_scores, nms_cfg.score_thr, - nms_cfg.max_per_frame, nms_cfg) - lidar_boxes3d = LiDARInstance3DBoxes(boxes3d) - det = bbox3d2result(lidar_boxes3d, scores, labels) - box_preds_lidar = det['bboxes_3d'] - scores = det['scores_3d'] - labels = det['labels_3d'] - # box_preds_camera is in the cam0 system - lidar2cam = cam0_info['images'][self.default_cam_key]['lidar2img'] - lidar2cam = np.array(lidar2cam).astype(np.float32) - box_preds_camera = box_preds_lidar.convert_to( - Box3DMode.CAM, lidar2cam, correct_yaw=True) - # Note: bbox is meaningless in final evaluation, set to 0 - merged_box_dict = dict( - bbox=np.zeros([box_preds_lidar.tensor.shape[0], 4]), - box3d_camera=box_preds_camera.numpy(), - box3d_lidar=box_preds_lidar.numpy(), - scores=scores.numpy(), - label_preds=labels.numpy(), - sample_idx=box_dict['sample_idx'], - ) - return merged_box_dict - - def bbox2result_kitti( - self, - net_outputs: List[dict], - sample_idx_list: List[int], - class_names: List[str], - pklfile_prefix: Optional[str] = None, - submission_prefix: Optional[str] = None) -> List[dict]: - """Convert 3D detection results to kitti format for evaluation and test - submission. - - Args: - net_outputs (List[dict]): List of dict storing the inferenced - bounding boxes and scores. - sample_idx_list (List[int]): List of input sample idx. - class_names (List[str]): A list of class names. - pklfile_prefix (str, optional): The prefix of pkl file. - Defaults to None. - submission_prefix (str, optional): The prefix of submission file. - Defaults to None. - - Returns: - List[dict]: A list of dictionaries with the kitti format. + Dict: Merged results. """ - if submission_prefix is not None: - mmengine.mkdir_or_exist(submission_prefix) - - det_annos = [] - print('\nConverting prediction to KITTI format') - for idx, pred_dicts in enumerate( - mmengine.track_iter_progress(net_outputs)): - sample_idx = sample_idx_list[idx] - info = self.data_infos[sample_idx] - - if self.load_type == 'mv_image_based': - if idx % self.num_cams == 0: - box_dict_per_frame = [] - cam0_key = list(info['images'].keys())[0] - cam0_info = info - # Here in mono3d, we use the 'CAM_FRONT' "the first - # index in the camera" as the default image shape. 
- # If you want to another camera, please modify it. - image_shape = (info['images'][cam0_key]['height'], - info['images'][cam0_key]['width']) - box_dict = self.convert_valid_bboxes(pred_dicts, info) - else: - box_dict = self.convert_valid_bboxes(pred_dicts, info) - # Here default used 'CAM_FRONT' to compute metric. - # If you want to use another camera, please modify it. - image_shape = (info['images'][self.default_cam_key]['height'], - info['images'][self.default_cam_key]['width']) - if self.load_type == 'mv_image_based': - box_dict_per_frame.append(box_dict) - if (idx + 1) % self.num_cams != 0: - continue - box_dict = self.merge_multi_view_boxes(box_dict_per_frame, - cam0_info) - - anno = { - 'name': [], - 'truncated': [], - 'occluded': [], - 'alpha': [], - 'bbox': [], - 'dimensions': [], - 'location': [], - 'rotation_y': [], - 'score': [] - } - if len(box_dict['bbox']) > 0: - box_2d_preds = box_dict['bbox'] - box_preds = box_dict['box3d_camera'] - scores = box_dict['scores'] - box_preds_lidar = box_dict['box3d_lidar'] - label_preds = box_dict['label_preds'] - - for box, box_lidar, bbox, score, label in zip( - box_preds, box_preds_lidar, box_2d_preds, scores, - label_preds): - bbox[2:] = np.minimum(bbox[2:], image_shape[::-1]) - bbox[:2] = np.maximum(bbox[:2], [0, 0]) - anno['name'].append(class_names[int(label)]) - anno['truncated'].append(0.0) - anno['occluded'].append(0) - anno['alpha'].append( - -np.arctan2(-box_lidar[1], box_lidar[0]) + box[6]) - anno['bbox'].append(bbox) - anno['dimensions'].append(box[3:6]) - anno['location'].append(box[:3]) - anno['rotation_y'].append(box[6]) - anno['score'].append(score) - - anno = {k: np.stack(v) for k, v in anno.items()} - else: - anno = { - 'name': np.array([]), - 'truncated': np.array([]), - 'occluded': np.array([]), - 'alpha': np.array([]), - 'bbox': np.zeros([0, 4]), - 'dimensions': np.zeros([0, 3]), - 'location': np.zeros([0, 3]), - 'rotation_y': np.array([]), - 'score': np.array([]), - } - - if submission_prefix is not None: - curr_file = f'{submission_prefix}/{sample_idx:06d}.txt' - with open(curr_file, 'w') as f: - bbox = anno['bbox'] - loc = anno['location'] - dims = anno['dimensions'] # lhw -> hwl - - for idx in range(len(bbox)): - print( - '{} -1 -1 {:.4f} {:.4f} {:.4f} {:.4f} ' - '{:.4f} {:.4f} {:.4f} ' - '{:.4f} {:.4f} {:.4f} {:.4f} {:.4f} {:.4f}'.format( - anno['name'][idx], anno['alpha'][idx], - bbox[idx][0], bbox[idx][1], bbox[idx][2], - bbox[idx][3], dims[idx][1], dims[idx][2], - dims[idx][0], loc[idx][0], loc[idx][1], - loc[idx][2], anno['rotation_y'][idx], - anno['score'][idx]), - file=f) - if self.use_pred_sample_idx: - save_sample_idx = sample_idx - else: - # use the sample idx in the info file - # In waymo validation sample_idx in prediction is 000xxx - # but in info file it is 1000xxx - save_sample_idx = box_dict['sample_idx'] - anno['sample_idx'] = np.array( - [save_sample_idx] * len(anno['score']), dtype=np.int64) - - det_annos.append(anno) - - if pklfile_prefix is not None: - if not pklfile_prefix.endswith(('.pkl', '.pickle')): - out = f'{pklfile_prefix}.pkl' - else: - out = pklfile_prefix - mmengine.dump(det_annos, out) - print(f'Result is saved to {out}.') - - return det_annos - - def convert_valid_bboxes(self, box_dict: dict, info: dict) -> dict: - """Convert the predicted boxes into valid ones. Should handle the - load_model (frame_based, mv_image_based, fov_image_based), separately. - - Args: - box_dict (dict): Box dictionaries to be converted. - - - bboxes_3d (:obj:`BaseInstance3DBoxes`): 3D bounding boxes. 
- - scores_3d (Tensor): Scores of boxes. - - labels_3d (Tensor): Class labels of boxes. - info (dict): Data info. - - Returns: - dict: Valid predicted boxes. - - - bbox (np.ndarray): 2D bounding boxes. - - box3d_camera (np.ndarray): 3D bounding boxes in camera - coordinate. - - box3d_lidar (np.ndarray): 3D bounding boxes in LiDAR coordinate. - - scores (np.ndarray): Scores of boxes. - - label_preds (np.ndarray): Class label predictions. - - sample_idx (int): Sample index. - """ - # TODO: refactor this function - box_preds = box_dict['bboxes_3d'] - scores = box_dict['scores_3d'] - labels = box_dict['labels_3d'] - sample_idx = info['sample_idx'] - box_preds.limit_yaw(offset=0.5, period=np.pi * 2) - - if len(box_preds) == 0: - return dict( - bbox=np.zeros([0, 4]), - box3d_camera=np.zeros([0, 7]), - box3d_lidar=np.zeros([0, 7]), - scores=np.zeros([0]), - label_preds=np.zeros([0, 4]), - sample_idx=sample_idx) - # Here default used 'CAM_FRONT' to compute metric. If you want to - # use another camera, please modify it. - if self.load_type in ['frame_based', 'fov_image_based']: - cam_key = self.default_cam_key - elif self.load_type == 'mv_image_based': - cam_key = list(info['images'].keys())[0] - else: - raise NotImplementedError - - lidar2cam = np.array(info['images'][cam_key]['lidar2cam']).astype( - np.float32) - P2 = np.array(info['images'][cam_key]['cam2img']).astype(np.float32) - img_shape = (info['images'][cam_key]['height'], - info['images'][cam_key]['width']) - P2 = box_preds.tensor.new_tensor(P2) - - if isinstance(box_preds, LiDARInstance3DBoxes): - box_preds_camera = box_preds.convert_to(Box3DMode.CAM, lidar2cam) - box_preds_lidar = box_preds - elif isinstance(box_preds, CameraInstance3DBoxes): - box_preds_camera = box_preds - box_preds_lidar = box_preds.convert_to(Box3DMode.LIDAR, - np.linalg.inv(lidar2cam)) - - box_corners = box_preds_camera.corners - box_corners_in_image = points_cam2img(box_corners, P2) - # box_corners_in_image: [N, 8, 2] - minxy = torch.min(box_corners_in_image, dim=1)[0] - maxxy = torch.max(box_corners_in_image, dim=1)[0] - box_2d_preds = torch.cat([minxy, maxxy], dim=1) - # Post-processing - # check box_preds_camera - image_shape = box_preds.tensor.new_tensor(img_shape) - valid_cam_inds = ((box_2d_preds[:, 0] < image_shape[1]) & - (box_2d_preds[:, 1] < image_shape[0]) & - (box_2d_preds[:, 2] > 0) & (box_2d_preds[:, 3] > 0)) - # check box_preds_lidar - if self.load_type in ['frame_based']: - limit_range = box_preds.tensor.new_tensor(self.pcd_limit_range) - valid_pcd_inds = ((box_preds_lidar.center > limit_range[:3]) & - (box_preds_lidar.center < limit_range[3:])) - valid_inds = valid_pcd_inds.all(-1) - elif self.load_type in ['mv_image_based', 'fov_image_based']: - valid_inds = valid_cam_inds - - if valid_inds.sum() > 0: - return dict( - bbox=box_2d_preds[valid_inds, :].numpy(), - pred_box_type_3d=type(box_preds), - box3d_camera=box_preds_camera[valid_inds].numpy(), - box3d_lidar=box_preds_lidar[valid_inds].numpy(), - scores=scores[valid_inds].numpy(), - label_preds=labels[valid_inds].numpy(), - sample_idx=sample_idx) - else: - return dict( - bbox=np.zeros([0, 4]), - pred_box_type_3d=type(box_preds), - box3d_camera=np.zeros([0, 7]), - box3d_lidar=np.zeros([0, 7]), - scores=np.zeros([0]), - label_preds=np.zeros([0]), - sample_idx=sample_idx) + merged_results = [] + for frame_result in frame_results: + merged_result = dict() + merged_result['sample_idx'] = frame_result[0]['sample_idx'] // 5 + merged_result['context_name'] = frame_result[0]['context_name'] + 
merged_result['timestamp'] = frame_result[0]['timestamp']
+            bboxes_3d, scores_3d, labels_3d = [], [], []
+            for result in frame_result:
+                assert result['timestamp'] == merged_result['timestamp']
+                bboxes_3d.append(result['bboxes_3d'])
+                scores_3d.append(result['scores_3d'])
+                labels_3d.append(result['labels_3d'])
+
+            bboxes_3d = np.concatenate(bboxes_3d)
+            scores_3d = np.concatenate(scores_3d)
+            labels_3d = np.concatenate(labels_3d)
+
+            device = get_device()
+            lidar_boxes3d = LiDARInstance3DBoxes(
+                torch.from_numpy(bboxes_3d).to(device))
+            scores = torch.from_numpy(scores_3d).to(device)
+            labels = torch.from_numpy(labels_3d).long().to(device)
+            nms_scores = scores.new_zeros(scores.shape[0],
+                                          len(self.classes) + 1)
+            indices = labels.new_tensor(list(range(scores.shape[0])))
+            nms_scores[indices, labels] = scores
+            lidar_boxes3d_for_nms = xywhr2xyxyr(lidar_boxes3d.bev)
+            boxes3d = lidar_boxes3d.tensor
+            bboxes_3d, scores_3d, labels_3d = box3d_multiclass_nms(
+                boxes3d, lidar_boxes3d_for_nms, nms_scores,
+                self.nms_cfg.score_thr, self.nms_cfg.max_per_frame,
+                self.nms_cfg)
+
+            merged_result['bboxes_3d'] = bboxes_3d.cpu().numpy()
+            merged_result['scores_3d'] = scores_3d.cpu().numpy()
+            merged_result['labels_3d'] = labels_3d.cpu().numpy()
+            merged_results.append(merged_result)
+        return merged_results
diff --git a/projects/CenterFormer/configs/centerformer_voxel01_second-attn_secfpn-attn_4xb4-cyclic-20e_waymoD5-3d-3class.py b/projects/CenterFormer/configs/centerformer_voxel01_second-attn_secfpn-attn_4xb4-cyclic-20e_waymoD5-3d-3class.py
index 14bcbb9296..5b207c7992 100644
--- a/projects/CenterFormer/configs/centerformer_voxel01_second-attn_secfpn-attn_4xb4-cyclic-20e_waymoD5-3d-3class.py
+++ b/projects/CenterFormer/configs/centerformer_voxel01_second-attn_secfpn-attn_4xb4-cyclic-20e_waymoD5-3d-3class.py
@@ -179,7 +179,10 @@
         dict(
             type='PointsRangeFilter', point_cloud_range=point_cloud_range)
     ]),
-    dict(type='Pack3DDetInputs', keys=['points'])
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points'],
+        meta_keys=['box_type_3d', 'sample_idx', 'context_name', 'timestamp'])
 ]
 
 dataset_type = 'WaymoDataset'
@@ -223,13 +226,7 @@
 test_dataloader = val_dataloader
 
 val_evaluator = dict(
-    type='WaymoMetric',
-    ann_file='./data/waymo/kitti_format/waymo_infos_val.pkl',
-    waymo_bin_file='./data/waymo/waymo_format/gt.bin',
-    data_root='./data/waymo/waymo_format',
-    backend_args=backend_args,
-    convert_kitti_format=False,
-    idx2metainfo='./data/waymo/waymo_format/idx2metainfo.pkl')
+    type='WaymoMetric', waymo_bin_file='./data/waymo/waymo_format/gt.bin')
 test_evaluator = val_evaluator
 
 vis_backends = [dict(type='LocalVisBackend')]
diff --git a/tests/data/waymo/kitti_format/waymo_infos_train.pkl b/tests/data/waymo/kitti_format/waymo_infos_train.pkl
index e89255005950effc80d8d4bab100bfef12856f30..f2f587c6ad343c69b1276de411fe8673a11e0d1f 100644
GIT binary patch
diff --git a/tests/data/waymo/kitti_format/waymo_infos_val.pkl b/tests/data/waymo/kitti_format/waymo_infos_val.pkl
index 0ce2230bb123b686dc3a4c337003fce992fbd8e4..82a6ed4704e8b4e8f07aa13994a6c92ca1b7eab8 100644
GIT binary patch
diff --git a/tests/test_datasets/test_waymo_dataset.py b/tests/test_datasets/test_waymo_dataset.py
new file mode 100644
index 0000000000..20ec1fc173
--- /dev/null
+++ b/tests/test_datasets/test_waymo_dataset.py
@@ -0,0 +1,80 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+ +import numpy as np +import torch +from mmcv.transforms.base import BaseTransform +from mmengine.registry import TRANSFORMS +from mmengine.structures import InstanceData + +from mmdet3d.datasets import WaymoDataset +from mmdet3d.structures import Det3DDataSample, LiDARInstance3DBoxes + + +def _generate_waymo_dataset_config(): + data_root = 'tests/data/waymo/kitti_format' + ann_file = 'waymo_infos_train.pkl' + classes = ['Car', 'Pedestrian', 'Cyclist'] + # wait for pipline refactor + + if 'Identity' not in TRANSFORMS: + + @TRANSFORMS.register_module() + class Identity(BaseTransform): + + def transform(self, info): + if 'ann_info' in info: + info['gt_labels_3d'] = info['ann_info']['gt_labels_3d'] + data_sample = Det3DDataSample() + gt_instances_3d = InstanceData() + gt_instances_3d.labels_3d = info['gt_labels_3d'] + data_sample.gt_instances_3d = gt_instances_3d + info['data_samples'] = data_sample + return info + + pipeline = [ + dict(type='Identity'), + ] + + modality = dict(use_lidar=True, use_camera=True) + data_prefix = data_prefix = dict( + pts='training/velodyne', CAM_FRONT='training/image_0') + return data_root, ann_file, classes, data_prefix, pipeline, modality + + +def test_getitem(): + data_root, ann_file, classes, data_prefix, \ + pipeline, modality, = _generate_waymo_dataset_config() + + waymo_dataset = WaymoDataset( + data_root, + ann_file, + data_prefix=data_prefix, + pipeline=pipeline, + metainfo=dict(classes=classes), + modality=modality) + + waymo_dataset.prepare_data(0) + input_dict = waymo_dataset.get_data_info(0) + waymo_dataset[0] + # assert the the path should contains data_prefix and data_root + assert data_prefix['pts'] in input_dict['lidar_points']['lidar_path'] + assert data_root in input_dict['lidar_points']['lidar_path'] + for cam_id, img_info in input_dict['images'].items(): + if 'img_path' in img_info: + assert data_prefix['CAM_FRONT'] in img_info['img_path'] + assert data_root in img_info['img_path'] + + ann_info = waymo_dataset.parse_ann_info(input_dict) + + # only one instance + assert 'gt_labels_3d' in ann_info + assert ann_info['gt_labels_3d'].dtype == np.int64 + + assert 'gt_bboxes_3d' in ann_info + assert isinstance(ann_info['gt_bboxes_3d'], LiDARInstance3DBoxes) + assert torch.allclose(ann_info['gt_bboxes_3d'].tensor.sum(), + torch.tensor(43.3103)) + assert 'centers_2d' in ann_info + assert ann_info['centers_2d'].dtype == np.float32 + assert 'depths' in ann_info + assert ann_info['depths'].dtype == np.float32 diff --git a/tools/create_data.py b/tools/create_data.py index 34356c2a8f..d8c6495d6a 100644 --- a/tools/create_data.py +++ b/tools/create_data.py @@ -2,6 +2,8 @@ import argparse from os import path as osp +from mmengine import print_log + from tools.dataset_converters import indoor_converter as indoor from tools.dataset_converters import kitti_converter as kitti from tools.dataset_converters import lyft_converter as lyft_converter @@ -171,8 +173,19 @@ def waymo_data_prep(root_path, version, out_dir, workers, - max_sweeps=5): - """Prepare the info file for waymo dataset. + max_sweeps=10, + only_gt_database=False, + save_senor_data=False, + skip_cam_instances_infos=False): + """Prepare waymo dataset. There are 3 steps as follows: + + Step 1. Extract camera images and lidar point clouds from waymo raw + data in '*.tfreord' and save as kitti format. + Step 2. Generate waymo train/val/test infos and save as pickle file. + Step 3. Generate waymo ground truth database (point clouds within + each 3D bounding box) for data augmentation in training. 
+ Steps 1 and 2 will be done in Waymo2KITTI, and step 3 will be done in + GTDatabaseCreater. Args: root_path (str): Path of dataset root. @@ -180,44 +193,55 @@ def waymo_data_prep(root_path, out_dir (str): Output directory of the generated info file. workers (int): Number of threads to be used. max_sweeps (int, optional): Number of input consecutive frames. - Default: 5. Here we store pose information of these frames - for later use. + Default to 10. Here we store ego2global information of these + frames for later use. + only_gt_database (bool, optional): Whether to only generate ground + truth database. Default to False. + save_senor_data (bool, optional): Whether to skip saving + image and lidar. Default to False. + skip_cam_instances_infos (bool, optional): Whether to skip + gathering cam_instances infos in Step 2. Default to False. """ from tools.dataset_converters import waymo_converter as waymo - splits = [ - 'training', 'validation', 'testing', 'testing_3d_camera_only_detection' - ] - for i, split in enumerate(splits): - load_dir = osp.join(root_path, 'waymo_format', split) - if split == 'validation': - save_dir = osp.join(out_dir, 'kitti_format', 'training') - else: - save_dir = osp.join(out_dir, 'kitti_format', split) - converter = waymo.Waymo2KITTI( - load_dir, - save_dir, - prefix=str(i), - workers=workers, - test_mode=(split - in ['testing', 'testing_3d_camera_only_detection'])) - converter.convert() - - from tools.dataset_converters.waymo_converter import \ - create_ImageSets_img_ids - create_ImageSets_img_ids(osp.join(out_dir, 'kitti_format'), splits) - # Generate waymo infos + if version == 'v1.4': + splits = [ + 'training', 'validation', 'testing', + 'testing_3d_camera_only_detection' + ] + elif version == 'v1.4-mini': + splits = ['training', 'validation'] + else: + raise NotImplementedError(f'Unsupported Waymo version {version}!') out_dir = osp.join(out_dir, 'kitti_format') - kitti.create_waymo_info_file( - out_dir, info_prefix, max_sweeps=max_sweeps, workers=workers) - info_train_path = osp.join(out_dir, f'{info_prefix}_infos_train.pkl') - info_val_path = osp.join(out_dir, f'{info_prefix}_infos_val.pkl') - info_trainval_path = osp.join(out_dir, f'{info_prefix}_infos_trainval.pkl') - info_test_path = osp.join(out_dir, f'{info_prefix}_infos_test.pkl') - update_pkl_infos('waymo', out_dir=out_dir, pkl_path=info_train_path) - update_pkl_infos('waymo', out_dir=out_dir, pkl_path=info_val_path) - update_pkl_infos('waymo', out_dir=out_dir, pkl_path=info_trainval_path) - update_pkl_infos('waymo', out_dir=out_dir, pkl_path=info_test_path) + + if not only_gt_database: + for i, split in enumerate(splits): + load_dir = osp.join(root_path, 'waymo_format', split) + if split == 'validation': + save_dir = osp.join(out_dir, 'training') + else: + save_dir = osp.join(out_dir, split) + converter = waymo.Waymo2KITTI( + load_dir, + save_dir, + prefix=str(i), + workers=workers, + test_mode=(split + in ['testing', 'testing_3d_camera_only_detection']), + info_prefix=info_prefix, + max_sweeps=max_sweeps, + split=split, + save_senor_data=save_senor_data, + save_cam_instances=not skip_cam_instances_infos) + converter.convert() + if split == 'validation': + converter.merge_trainval_infos() + + from tools.dataset_converters.waymo_converter import \ + create_ImageSets_img_ids + create_ImageSets_img_ids(out_dir, splits) + GTDatabaseCreater( 'WaymoDataset', out_dir, @@ -227,6 +251,8 @@ def waymo_data_prep(root_path, with_mask=False, num_worker=workers).create() + print_log('Successfully preparing Waymo 
Open Dataset') + def semantickitti_data_prep(info_prefix, out_dir): """Prepare the info file for SemanticKITTI dataset. @@ -274,12 +300,23 @@ def semantickitti_data_prep(info_prefix, out_dir): parser.add_argument( '--only-gt-database', action='store_true', - help='Whether to only generate ground truth database.') + help='''Whether to only generate ground truth database. + Only used when dataset is NuScenes or Waymo!''') +parser.add_argument( + '--skip-cam_instances-infos', + action='store_true', + help='''Whether to skip gathering cam_instances infos. + Only used when dataset is Waymo!''') +parser.add_argument( + '--skip-saving-sensor-data', + action='store_true', + help='''Whether to skip saving image and lidar. + Only used when dataset is Waymo!''') args = parser.parse_args() if __name__ == '__main__': - from mmdet3d.utils import register_all_modules - register_all_modules() + from mmengine.registry import init_default_scope + init_default_scope('mmdet3d') if args.dataset == 'kitti': if args.only_gt_database: @@ -334,6 +371,17 @@ def semantickitti_data_prep(info_prefix, out_dir): dataset_name='NuScenesDataset', out_dir=args.out_dir, max_sweeps=args.max_sweeps) + elif args.dataset == 'waymo': + waymo_data_prep( + root_path=args.root_path, + info_prefix=args.extra_tag, + version=args.version, + out_dir=args.out_dir, + workers=args.workers, + max_sweeps=args.max_sweeps, + only_gt_database=args.only_gt_database, + save_senor_data=not args.skip_saving_sensor_data, + skip_cam_instances_infos=args.skip_cam_instances_infos) elif args.dataset == 'lyft': train_version = f'{args.version}-train' lyft_data_prep( @@ -347,14 +395,6 @@ def semantickitti_data_prep(info_prefix, out_dir): info_prefix=args.extra_tag, version=test_version, max_sweeps=args.max_sweeps) - elif args.dataset == 'waymo': - waymo_data_prep( - root_path=args.root_path, - info_prefix=args.extra_tag, - version=args.version, - out_dir=args.out_dir, - workers=args.workers, - max_sweeps=args.max_sweeps) elif args.dataset == 'scannet': scannet_data_prep( root_path=args.root_path, diff --git a/tools/create_data.sh b/tools/create_data.sh index 9a57852f71..0a1946585d 100755 --- a/tools/create_data.sh +++ b/tools/create_data.sh @@ -6,10 +6,11 @@ export PYTHONPATH=`pwd`:$PYTHONPATH PARTITION=$1 JOB_NAME=$2 DATASET=$3 +WORKERS=$4 GPUS=${GPUS:-1} GPUS_PER_NODE=${GPUS_PER_NODE:-1} SRUN_ARGS=${SRUN_ARGS:-""} -JOB_NAME=create_data +PY_ARGS=${@:5} srun -p ${PARTITION} \ --job-name=${JOB_NAME} \ @@ -21,4 +22,6 @@ srun -p ${PARTITION} \ python -u tools/create_data.py ${DATASET} \ --root-path ./data/${DATASET} \ --out-dir ./data/${DATASET} \ - --extra-tag ${DATASET} + --workers ${WORKERS} \ + --extra-tag ${DATASET} \ + ${PY_ARGS} diff --git a/tools/dataset_converters/create_gt_database.py b/tools/dataset_converters/create_gt_database.py index ae452eb543..fb84256fd8 100644 --- a/tools/dataset_converters/create_gt_database.py +++ b/tools/dataset_converters/create_gt_database.py @@ -7,7 +7,7 @@ import numpy as np from mmcv.ops import roi_align from mmdet.evaluation import bbox_overlaps -from mmengine import track_iter_progress +from mmengine import print_log, track_iter_progress from pycocotools import mask as maskUtils from pycocotools.coco import COCO @@ -504,7 +504,9 @@ def create_single(self, input_dict): return single_db_infos def create(self): - print(f'Create GT Database of {self.dataset_class_name}') + print_log( + f'Create GT Database of {self.dataset_class_name}', + logger='current') dataset_cfg = dict( type=self.dataset_class_name, 
data_root=self.data_path, @@ -610,12 +612,19 @@ def loop_dataset(i): input_dict['box_mode_3d'] = self.dataset.box_mode_3d return input_dict - multi_db_infos = mmengine.track_parallel_progress( - self.create_single, - ((loop_dataset(i) - for i in range(len(self.dataset))), len(self.dataset)), - self.num_worker) - print('Make global unique group id') + if self.num_worker == 0: + multi_db_infos = mmengine.track_progress( + self.create_single, + ((loop_dataset(i) + for i in range(len(self.dataset))), len(self.dataset))) + else: + multi_db_infos = mmengine.track_parallel_progress( + self.create_single, + ((loop_dataset(i) + for i in range(len(self.dataset))), len(self.dataset)), + self.num_worker, + chunksize=1000) + print_log('Make global unique group id', logger='current') group_counter_offset = 0 all_db_infos = dict() for single_db_infos in track_iter_progress(multi_db_infos): @@ -630,7 +639,8 @@ def loop_dataset(i): group_counter_offset += (group_id + 1) for k, v in all_db_infos.items(): - print(f'load {len(v)} {k} database infos') + print_log(f'load {len(v)} {k} database infos', logger='current') + print_log(f'Saving GT database infos into {self.db_info_save_path}') with open(self.db_info_save_path, 'wb') as f: pickle.dump(all_db_infos, f) diff --git a/tools/dataset_converters/waymo_converter.py b/tools/dataset_converters/waymo_converter.py index 87f9c54b54..e383c238df 100644 --- a/tools/dataset_converters/waymo_converter.py +++ b/tools/dataset_converters/waymo_converter.py @@ -9,23 +9,33 @@ raise ImportError('Please run "pip install waymo-open-dataset-tf-2-6-0" ' '>1.4.5 to install the official devkit first.') +import copy import os +import os.path as osp from glob import glob +from io import BytesIO from os.path import exists, join import mmengine import numpy as np import tensorflow as tf +from mmengine import print_log +from nuscenes.utils.geometry_utils import view_points +from PIL import Image from waymo_open_dataset.utils import range_image_utils, transform_utils from waymo_open_dataset.utils.frame_utils import \ parse_range_image_and_camera_projection +from mmdet3d.datasets.convert_utils import post_process_coords +from mmdet3d.structures import Box3DMode, LiDARInstance3DBoxes, points_cam2img + class Waymo2KITTI(object): - """Waymo to KITTI converter. + """Waymo to KITTI converter. There are 2 steps as follows: - This class serves as the converter to change the waymo raw data to KITTI - format. + Step 1. Extract camera images and lidar point clouds from waymo raw data in + '*.tfreord' and save as kitti format. + Step 2. Generate waymo train/val/test infos and save as pickle file. Args: load_dir (str): Directory to load waymo raw data. @@ -36,8 +46,16 @@ class Waymo2KITTI(object): Defaults to 64. test_mode (bool, optional): Whether in the test_mode. Defaults to False. - save_cam_sync_labels (bool, optional): Whether to save cam sync labels. - Defaults to True. + save_senor_data (bool, optional): Whether to save image and lidar + data. Defaults to True. + save_cam_sync_instances (bool, optional): Whether to save cam sync + instances. Defaults to True. + save_cam_instances (bool, optional): Whether to save cam instances. + Defaults to False. + info_prefix (str, optional): Prefix of info filename. + Defaults to 'waymo'. + max_sweeps (int, optional): Max length of sweeps. Defaults to 10. + split (str, optional): Split of the data. Defaults to 'training'. 
""" def __init__(self, @@ -46,18 +64,12 @@ def __init__(self, prefix, workers=64, test_mode=False, - save_cam_sync_labels=True): - self.filter_empty_3dboxes = True - self.filter_no_label_zone_points = True - - self.selected_waymo_classes = ['VEHICLE', 'PEDESTRIAN', 'CYCLIST'] - - # Only data collected in specific locations will be converted - # If set None, this filter is disabled - # Available options: location_sf (main dataset) - self.selected_waymo_locations = None - self.save_track_id = False - + save_senor_data=True, + save_cam_sync_instances=True, + save_cam_instances=True, + info_prefix='waymo', + max_sweeps=10, + split='training'): # turn on eager execution for older tensorflow versions if int(tf.__version__.split('.')[0]) < 2: tf.enable_eager_execution() @@ -74,12 +86,21 @@ def __init__(self, self.type_list = [ 'UNKNOWN', 'VEHICLE', 'PEDESTRIAN', 'SIGN', 'CYCLIST' ] - self.waymo_to_kitti_class_map = { - 'UNKNOWN': 'DontCare', - 'PEDESTRIAN': 'Pedestrian', - 'VEHICLE': 'Car', - 'CYCLIST': 'Cyclist', - 'SIGN': 'Sign' # not in kitti + + # MMDetection3D unified camera keys & class names + self.camera_types = [ + 'CAM_FRONT', + 'CAM_FRONT_LEFT', + 'CAM_FRONT_RIGHT', + 'CAM_SIDE_LEFT', + 'CAM_SIDE_RIGHT', + ] + self.selected_waymo_classes = ['VEHICLE', 'PEDESTRIAN', 'CYCLIST'] + self.info_map = { + 'training': '_infos_train.pkl', + 'validation': '_infos_val.pkl', + 'testing': '_infos_test.pkl', + 'testing_3d_camera_only_detection': '_infos_test_cam_only.pkl' } self.load_dir = load_dir @@ -87,61 +108,87 @@ def __init__(self, self.prefix = prefix self.workers = int(workers) self.test_mode = test_mode - self.save_cam_sync_labels = save_cam_sync_labels + self.save_senor_data = save_senor_data + self.save_cam_sync_instances = save_cam_sync_instances + self.save_cam_instances = save_cam_instances + self.info_prefix = info_prefix + self.max_sweeps = max_sweeps + self.split = split + + # TODO: Discuss filter_empty_3dboxes and filter_no_label_zone_points + self.filter_empty_3dboxes = True + self.filter_no_label_zone_points = True + self.save_track_id = False self.tfrecord_pathnames = sorted( glob(join(self.load_dir, '*.tfrecord'))) - self.label_save_dir = f'{self.save_dir}/label_' - self.label_all_save_dir = f'{self.save_dir}/label_all' self.image_save_dir = f'{self.save_dir}/image_' - self.calib_save_dir = f'{self.save_dir}/calib' self.point_cloud_save_dir = f'{self.save_dir}/velodyne' - self.pose_save_dir = f'{self.save_dir}/pose' - self.timestamp_save_dir = f'{self.save_dir}/timestamp' - if self.save_cam_sync_labels: - self.cam_sync_label_save_dir = f'{self.save_dir}/cam_sync_label_' - self.cam_sync_label_all_save_dir = \ - f'{self.save_dir}/cam_sync_label_all' - self.create_folder() + # Create folder for saving KITTI format camera images and + # lidar point clouds. 
+ if 'testing_3d_camera_only_detection' not in self.load_dir: + mmengine.mkdir_or_exist(self.point_cloud_save_dir) + for i in range(5): + mmengine.mkdir_or_exist(f'{self.image_save_dir}{str(i)}') def convert(self): """Convert action.""" - print('Start converting ...') - mmengine.track_parallel_progress(self.convert_one, range(len(self)), - self.workers) - print('\nFinished ...') + print_log(f'Start converting {self.split} dataset', logger='current') + if self.workers == 0: + data_infos = mmengine.track_progress(self.convert_one, + range(len(self))) + else: + data_infos = mmengine.track_parallel_progress( + self.convert_one, range(len(self)), self.workers) + data_list = [] + for data_info in data_infos: + data_list.extend(data_info) + metainfo = dict() + metainfo['dataset'] = 'waymo' + metainfo['version'] = 'waymo_v1.4' + metainfo['info_version'] = 'mmdet3d_v1.4' + waymo_infos = dict(data_list=data_list, metainfo=metainfo) + filenames = osp.join( + osp.dirname(self.save_dir), + f'{self.info_prefix + self.info_map[self.split]}') + print_log(f'Saving {self.split} dataset infos into {filenames}') + mmengine.dump(waymo_infos, filenames) def convert_one(self, file_idx): - """Convert action for single file. + """Convert one '*.tfrecord' file to kitti format. Each file stores all + the frames (about 200 frames) in current scene. We treat each frame as + a sample, save their images and point clouds in kitti format, and then + create info for all frames. Args: file_idx (int): Index of the file to be converted. + + Returns: + List[dict]: Waymo infos for all frames in current file. """ pathname = self.tfrecord_pathnames[file_idx] dataset = tf.data.TFRecordDataset(pathname, compression_type='') + # NOTE: file_infos is not shared between processes, only stores frame + # infos within the current file. + file_infos = [] for frame_idx, data in enumerate(dataset): frame = dataset_pb2.Frame() frame.ParseFromString(bytearray(data.numpy())) - if (self.selected_waymo_locations is not None - and frame.context.stats.location - not in self.selected_waymo_locations): - continue - self.save_image(frame, file_idx, frame_idx) - self.save_calib(frame, file_idx, frame_idx) - self.save_lidar(frame, file_idx, frame_idx) - self.save_pose(frame, file_idx, frame_idx) - self.save_timestamp(frame, file_idx, frame_idx) + # Step 1. Extract camera images and lidar point clouds from waymo + # raw data in '*.tfreord' and save as kitti format. + if self.save_senor_data: + self.save_image(frame, file_idx, frame_idx) + self.save_lidar(frame, file_idx, frame_idx) - if not self.test_mode: - # TODO save the depth image for waymo challenge solution. - self.save_label(frame, file_idx, frame_idx) - if self.save_cam_sync_labels: - self.save_label(frame, file_idx, frame_idx, cam_sync=True) + # Step 2. Generate waymo train/val/test infos and save as pkl file. + # TODO save the depth image for waymo challenge solution. + self.create_waymo_info_file(frame, file_idx, frame_idx, file_infos) + return file_infos def __len__(self): """Length of the filename list.""" @@ -162,62 +209,6 @@ def save_image(self, frame, file_idx, frame_idx): with open(img_path, 'wb') as fp: fp.write(img.image) - def save_calib(self, frame, file_idx, frame_idx): - """Parse and save the calibration data. - - Args: - frame (:obj:`Frame`): Open dataset frame proto. - file_idx (int): Current file index. - frame_idx (int): Current frame index. 
- """ - # waymo front camera to kitti reference camera - T_front_cam_to_ref = np.array([[0.0, -1.0, 0.0], [0.0, 0.0, -1.0], - [1.0, 0.0, 0.0]]) - camera_calibs = [] - R0_rect = [f'{i:e}' for i in np.eye(3).flatten()] - Tr_velo_to_cams = [] - calib_context = '' - - for camera in frame.context.camera_calibrations: - # extrinsic parameters - T_cam_to_vehicle = np.array(camera.extrinsic.transform).reshape( - 4, 4) - T_vehicle_to_cam = np.linalg.inv(T_cam_to_vehicle) - Tr_velo_to_cam = \ - self.cart_to_homo(T_front_cam_to_ref) @ T_vehicle_to_cam - if camera.name == 1: # FRONT = 1, see dataset.proto for details - self.T_velo_to_front_cam = Tr_velo_to_cam.copy() - Tr_velo_to_cam = Tr_velo_to_cam[:3, :].reshape((12, )) - Tr_velo_to_cams.append([f'{i:e}' for i in Tr_velo_to_cam]) - - # intrinsic parameters - camera_calib = np.zeros((3, 4)) - camera_calib[0, 0] = camera.intrinsic[0] - camera_calib[1, 1] = camera.intrinsic[1] - camera_calib[0, 2] = camera.intrinsic[2] - camera_calib[1, 2] = camera.intrinsic[3] - camera_calib[2, 2] = 1 - camera_calib = list(camera_calib.reshape(12)) - camera_calib = [f'{i:e}' for i in camera_calib] - camera_calibs.append(camera_calib) - - # all camera ids are saved as id-1 in the result because - # camera 0 is unknown in the proto - for i in range(5): - calib_context += 'P' + str(i) + ': ' + \ - ' '.join(camera_calibs[i]) + '\n' - calib_context += 'R0_rect' + ': ' + ' '.join(R0_rect) + '\n' - for i in range(5): - calib_context += 'Tr_velo_to_cam_' + str(i) + ': ' + \ - ' '.join(Tr_velo_to_cams[i]) + '\n' - - with open( - f'{self.calib_save_dir}/{self.prefix}' + - f'{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}.txt', - 'w+') as fp_calib: - fp_calib.write(calib_context) - fp_calib.close() - def save_lidar(self, frame, file_idx, frame_idx): """Parse and save the lidar data in psd format. @@ -275,194 +266,6 @@ def save_lidar(self, frame, file_idx, frame_idx): f'{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}.bin' point_cloud.astype(np.float32).tofile(pc_path) - def save_label(self, frame, file_idx, frame_idx, cam_sync=False): - """Parse and save the label data in txt format. - The relation between waymo and kitti coordinates is noteworthy: - 1. x, y, z correspond to l, w, h (waymo) -> l, h, w (kitti) - 2. x-y-z: front-left-up (waymo) -> right-down-front(kitti) - 3. bbox origin at volumetric center (waymo) -> bottom center (kitti) - 4. rotation: +x around y-axis (kitti) -> +x around z-axis (waymo) - - Args: - frame (:obj:`Frame`): Open dataset frame proto. - file_idx (int): Current file index. - frame_idx (int): Current frame index. - cam_sync (bool, optional): Whether to save the cam sync labels. - Defaults to False. 
- """ - label_all_path = f'{self.label_all_save_dir}/{self.prefix}' + \ - f'{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}.txt' - if cam_sync: - label_all_path = label_all_path.replace('label_', - 'cam_sync_label_') - fp_label_all = open(label_all_path, 'w+') - id_to_bbox = dict() - id_to_name = dict() - for labels in frame.projected_lidar_labels: - name = labels.name - for label in labels.labels: - # TODO: need a workaround as bbox may not belong to front cam - bbox = [ - label.box.center_x - label.box.length / 2, - label.box.center_y - label.box.width / 2, - label.box.center_x + label.box.length / 2, - label.box.center_y + label.box.width / 2 - ] - id_to_bbox[label.id] = bbox - id_to_name[label.id] = name - 1 - - for obj in frame.laser_labels: - bounding_box = None - name = None - id = obj.id - for proj_cam in self.cam_list: - if id + proj_cam in id_to_bbox: - bounding_box = id_to_bbox.get(id + proj_cam) - name = str(id_to_name.get(id + proj_cam)) - break - - # NOTE: the 2D labels do not have strict correspondence with - # the projected 2D lidar labels - # e.g.: the projected 2D labels can be in camera 2 - # while the most_visible_camera can have id 4 - if cam_sync: - if obj.most_visible_camera_name: - name = str( - self.cam_list.index( - f'_{obj.most_visible_camera_name}')) - box3d = obj.camera_synced_box - else: - continue - else: - box3d = obj.box - - if bounding_box is None or name is None: - name = '0' - bounding_box = (0, 0, 0, 0) - - my_type = self.type_list[obj.type] - - if my_type not in self.selected_waymo_classes: - continue - - if self.filter_empty_3dboxes and obj.num_lidar_points_in_box < 1: - continue - - my_type = self.waymo_to_kitti_class_map[my_type] - - height = box3d.height - width = box3d.width - length = box3d.length - - x = box3d.center_x - y = box3d.center_y - z = box3d.center_z - height / 2 - - # project bounding box to the virtual reference frame - pt_ref = self.T_velo_to_front_cam @ \ - np.array([x, y, z, 1]).reshape((4, 1)) - x, y, z, _ = pt_ref.flatten().tolist() - - rotation_y = -box3d.heading - np.pi / 2 - track_id = obj.id - - # not available - truncated = 0 - occluded = 0 - alpha = -10 - - line = my_type + \ - ' {} {} {} {} {} {} {} {} {} {} {} {} {} {}\n'.format( - round(truncated, 2), occluded, round(alpha, 2), - round(bounding_box[0], 2), round(bounding_box[1], 2), - round(bounding_box[2], 2), round(bounding_box[3], 2), - round(height, 2), round(width, 2), round(length, 2), - round(x, 2), round(y, 2), round(z, 2), - round(rotation_y, 2)) - - if self.save_track_id: - line_all = line[:-1] + ' ' + name + ' ' + track_id + '\n' - else: - line_all = line[:-1] + ' ' + name + '\n' - - label_path = f'{self.label_save_dir}{name}/{self.prefix}' + \ - f'{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}.txt' - if cam_sync: - label_path = label_path.replace('label_', 'cam_sync_label_') - fp_label = open(label_path, 'a') - fp_label.write(line) - fp_label.close() - - fp_label_all.write(line_all) - - fp_label_all.close() - - def save_pose(self, frame, file_idx, frame_idx): - """Parse and save the pose data. - - Note that SDC's own pose is not included in the regular training - of KITTI dataset. KITTI raw dataset contains ego motion files - but are not often used. Pose is important for algorithms that - take advantage of the temporal information. - - Args: - frame (:obj:`Frame`): Open dataset frame proto. - file_idx (int): Current file index. - frame_idx (int): Current frame index. 
- """ - pose = np.array(frame.pose.transform).reshape(4, 4) - np.savetxt( - join(f'{self.pose_save_dir}/{self.prefix}' + - f'{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}.txt'), - pose) - - def save_timestamp(self, frame, file_idx, frame_idx): - """Save the timestamp data in a separate file instead of the - pointcloud. - - Note that SDC's own pose is not included in the regular training - of KITTI dataset. KITTI raw dataset contains ego motion files - but are not often used. Pose is important for algorithms that - take advantage of the temporal information. - - Args: - frame (:obj:`Frame`): Open dataset frame proto. - file_idx (int): Current file index. - frame_idx (int): Current frame index. - """ - with open( - join(f'{self.timestamp_save_dir}/{self.prefix}' + - f'{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}.txt'), - 'w') as f: - f.write(str(frame.timestamp_micros)) - - def create_folder(self): - """Create folder for data preprocessing.""" - if not self.test_mode: - dir_list1 = [ - self.label_all_save_dir, - self.calib_save_dir, - self.pose_save_dir, - self.timestamp_save_dir, - ] - dir_list2 = [self.label_save_dir, self.image_save_dir] - if self.save_cam_sync_labels: - dir_list1.append(self.cam_sync_label_all_save_dir) - dir_list2.append(self.cam_sync_label_save_dir) - else: - dir_list1 = [ - self.calib_save_dir, self.pose_save_dir, - self.timestamp_save_dir - ] - dir_list2 = [self.image_save_dir] - if 'testing_3d_camera_only_detection' not in self.load_dir: - dir_list1.append(self.point_cloud_save_dir) - for d in dir_list1: - mmengine.mkdir_or_exist(d) - for d in dir_list2: - for i in range(5): - mmengine.mkdir_or_exist(f'{d}{str(i)}') - def convert_range_image_to_point_cloud(self, frame, range_images, @@ -604,29 +407,317 @@ def cart_to_homo(self, mat): raise ValueError(mat.shape) return ret + def create_waymo_info_file(self, frame, file_idx, frame_idx, file_infos): + r"""Generate waymo train/val/test infos. 
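+        Each processed frame becomes one info dict covering the sample index,
+        ego pose, camera and lidar paths, sweeps and instance annotations.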
+ + For more details about infos, please refer to: + https://mmdetection3d.readthedocs.io/en/latest/advanced_guides/datasets/waymo.html + """ # noqa: E501 + frame_infos = dict() + + # Gather frame infos + sample_idx = \ + f'{self.prefix}{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}' + frame_infos['sample_idx'] = int(sample_idx) + frame_infos['timestamp'] = frame.timestamp_micros + frame_infos['ego2global'] = np.array(frame.pose.transform).reshape( + 4, 4).astype(np.float32).tolist() + frame_infos['context_name'] = frame.context.name + + # Gather camera infos + frame_infos['images'] = dict() + # waymo front camera to kitti reference camera + T_front_cam_to_ref = np.array([[0.0, -1.0, 0.0], [0.0, 0.0, -1.0], + [1.0, 0.0, 0.0]]) + camera_calibs = [] + Tr_velo_to_cams = [] + for camera in frame.context.camera_calibrations: + # extrinsic parameters + T_cam_to_vehicle = np.array(camera.extrinsic.transform).reshape( + 4, 4) + T_vehicle_to_cam = np.linalg.inv(T_cam_to_vehicle) + Tr_velo_to_cam = \ + self.cart_to_homo(T_front_cam_to_ref) @ T_vehicle_to_cam + Tr_velo_to_cams.append(Tr_velo_to_cam) + + # intrinsic parameters + camera_calib = np.zeros((3, 4)) + camera_calib[0, 0] = camera.intrinsic[0] + camera_calib[1, 1] = camera.intrinsic[1] + camera_calib[0, 2] = camera.intrinsic[2] + camera_calib[1, 2] = camera.intrinsic[3] + camera_calib[2, 2] = 1 + camera_calibs.append(camera_calib) + + for i, (cam_key, camera_calib, Tr_velo_to_cam) in enumerate( + zip(self.camera_types, camera_calibs, Tr_velo_to_cams)): + cam_infos = dict() + cam_infos['img_path'] = str(sample_idx) + '.jpg' + # NOTE: frames.images order is different + for img in frame.images: + if img.name == i + 1: + width, height = Image.open(BytesIO(img.image)).size + cam_infos['height'] = height + cam_infos['width'] = width + cam_infos['lidar2cam'] = Tr_velo_to_cam.astype(np.float32).tolist() + cam_infos['cam2img'] = camera_calib.astype(np.float32).tolist() + cam_infos['lidar2img'] = (camera_calib @ Tr_velo_to_cam).astype( + np.float32).tolist() + frame_infos['images'][cam_key] = cam_infos + + # Gather lidar infos + lidar_infos = dict() + lidar_infos['lidar_path'] = str(sample_idx) + '.bin' + lidar_infos['num_pts_feats'] = 6 + frame_infos['lidar_points'] = lidar_infos + + # Gather lidar sweeps and camera sweeps infos + # TODO: Add lidar2img in image sweeps infos when we need it. + # TODO: Consider merging lidar sweeps infos and image sweeps infos. 
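+        # walk backwards through frames already processed in this file to collect up to max_sweeps previous frames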
+ lidar_sweeps_infos, image_sweeps_infos = [], [] + for prev_offset in range(-1, -self.max_sweeps - 1, -1): + prev_lidar_infos = dict() + prev_image_infos = dict() + if frame_idx + prev_offset >= 0: + prev_frame_infos = file_infos[prev_offset] + prev_lidar_infos['timestamp'] = prev_frame_infos['timestamp'] + prev_lidar_infos['ego2global'] = prev_frame_infos['ego2global'] + prev_lidar_infos['lidar_points'] = dict() + lidar_path = prev_frame_infos['lidar_points']['lidar_path'] + prev_lidar_infos['lidar_points']['lidar_path'] = lidar_path + lidar_sweeps_infos.append(prev_lidar_infos) + + prev_image_infos['timestamp'] = prev_frame_infos['timestamp'] + prev_image_infos['ego2global'] = prev_frame_infos['ego2global'] + prev_image_infos['images'] = dict() + for cam_key in self.camera_types: + prev_image_infos['images'][cam_key] = dict() + img_path = prev_frame_infos['images'][cam_key]['img_path'] + prev_image_infos['images'][cam_key]['img_path'] = img_path + image_sweeps_infos.append(prev_image_infos) + if lidar_sweeps_infos: + frame_infos['lidar_sweeps'] = lidar_sweeps_infos + if image_sweeps_infos: + frame_infos['image_sweeps'] = image_sweeps_infos + + if not self.test_mode: + # Gather instances infos which is used for lidar-based 3D detection + frame_infos['instances'] = self.gather_instance_info(frame) + # Gather cam_sync_instances infos which is used for image-based + # (multi-view) 3D detection. + if self.save_cam_sync_instances: + frame_infos['cam_sync_instances'] = self.gather_instance_info( + frame, cam_sync=True) + # Gather cam_instances infos which is used for image-based + # (monocular) 3D detection (optional). + # TODO: Should we use cam_sync_instances to generate cam_instances? + if self.save_cam_instances: + frame_infos['cam_instances'] = self.gather_cam_instance_info( + copy.deepcopy(frame_infos['instances']), + frame_infos['images']) + file_infos.append(frame_infos) + + def gather_instance_info(self, frame, cam_sync=False): + """Generate instances and cam_sync_instances infos. 
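+        Only VEHICLE, PEDESTRIAN and CYCLIST boxes that contain at least one
+        lidar point are kept.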
+ + For more details about infos, please refer to: + https://mmdetection3d.readthedocs.io/en/latest/advanced_guides/datasets/waymo.html + """ # noqa: E501 + id_to_bbox = dict() + id_to_name = dict() + for labels in frame.projected_lidar_labels: + name = labels.name + for label in labels.labels: + # TODO: need a workaround as bbox may not belong to front cam + bbox = [ + label.box.center_x - label.box.length / 2, + label.box.center_y - label.box.width / 2, + label.box.center_x + label.box.length / 2, + label.box.center_y + label.box.width / 2 + ] + id_to_bbox[label.id] = bbox + id_to_name[label.id] = name - 1 + + group_id = 0 + instance_infos = [] + for obj in frame.laser_labels: + instance_info = dict() + bounding_box = None + name = None + id = obj.id + for proj_cam in self.cam_list: + if id + proj_cam in id_to_bbox: + bounding_box = id_to_bbox.get(id + proj_cam) + name = id_to_name.get(id + proj_cam) + break + + # NOTE: the 2D labels do not have strict correspondence with + # the projected 2D lidar labels + # e.g.: the projected 2D labels can be in camera 2 + # while the most_visible_camera can have id 4 + if cam_sync: + if obj.most_visible_camera_name: + name = self.cam_list.index( + f'_{obj.most_visible_camera_name}') + box3d = obj.camera_synced_box + else: + continue + else: + box3d = obj.box + + if bounding_box is None or name is None: + name = 0 + bounding_box = [0.0, 0.0, 0.0, 0.0] + + my_type = self.type_list[obj.type] + + if my_type not in self.selected_waymo_classes: + continue + else: + label = self.selected_waymo_classes.index(my_type) + + if self.filter_empty_3dboxes and obj.num_lidar_points_in_box < 1: + continue + + group_id += 1 + instance_info['group_id'] = group_id + instance_info['camera_id'] = name + instance_info['bbox'] = bounding_box + instance_info['bbox_label'] = label + + height = box3d.height + width = box3d.width + length = box3d.length + + # NOTE: We save the bottom center of 3D bboxes. + x = box3d.center_x + y = box3d.center_y + z = box3d.center_z - height / 2 + + rotation_y = box3d.heading + + instance_info['bbox_3d'] = np.array( + [x, y, z, length, width, height, + rotation_y]).astype(np.float32).tolist() + instance_info['bbox_label_3d'] = label + instance_info['num_lidar_pts'] = obj.num_lidar_points_in_box + + if self.save_track_id: + instance_info['track_id'] = obj.id + instance_infos.append(instance_info) + return instance_infos + + def gather_cam_instance_info(self, instances: dict, images: dict): + """Generate cam_instances infos. + + For more details about infos, please refer to: + https://mmdetection3d.readthedocs.io/en/latest/advanced_guides/datasets/waymo.html + """ # noqa: E501 + cam_instances = dict() + for cam_type in self.camera_types: + lidar2cam = np.array(images[cam_type]['lidar2cam']) + cam2img = np.array(images[cam_type]['cam2img']) + cam_instances[cam_type] = [] + for instance in instances: + cam_instance = dict() + gt_bboxes_3d = np.array(instance['bbox_3d']) + # Convert lidar coordinates to camera coordinates + gt_bboxes_3d = LiDARInstance3DBoxes( + gt_bboxes_3d[None, :]).convert_to( + Box3DMode.CAM, lidar2cam, correct_yaw=True) + corners_3d = gt_bboxes_3d.corners.numpy() + corners_3d = corners_3d[0].T # (1, 8, 3) -> (3, 8) + in_camera = np.argwhere(corners_3d[2, :] > 0).flatten() + corners_3d = corners_3d[:, in_camera] + # Project 3d box to 2d. + corner_coords = view_points(corners_3d, cam2img, + True).T[:, :2].tolist() + + # Keep only corners that fall within the image. 
+ # TODO: imsize should be determined by the current image size + # CAM_FRONT: (1920, 1280) + # CAM_FRONT_LEFT: (1920, 1280) + # CAM_SIDE_LEFT: (1920, 886) + final_coords = post_process_coords( + corner_coords, + imsize=(images['CAM_FRONT']['width'], + images['CAM_FRONT']['height'])) + + # Skip if the convex hull of the re-projected corners + # does not intersect the image canvas. + if final_coords is None: + continue + else: + min_x, min_y, max_x, max_y = final_coords + + cam_instance['bbox'] = [min_x, min_y, max_x, max_y] + cam_instance['bbox_label'] = instance['bbox_label'] + cam_instance['bbox_3d'] = gt_bboxes_3d.numpy().squeeze( + ).astype(np.float32).tolist() + cam_instance['bbox_label_3d'] = instance['bbox_label_3d'] + + center_3d = gt_bboxes_3d.gravity_center.numpy() + center_2d_with_depth = points_cam2img( + center_3d, cam2img, with_depth=True) + center_2d_with_depth = center_2d_with_depth.squeeze().tolist() + + # normalized center2D + depth + # if samples with depth < 0 will be removed + if center_2d_with_depth[2] <= 0: + continue + cam_instance['center_2d'] = center_2d_with_depth[:2] + cam_instance['depth'] = center_2d_with_depth[2] + + # TODO: Discuss whether following info is necessary + cam_instance['bbox_3d_isvalid'] = True + cam_instance['velocity'] = -1 + cam_instances[cam_type].append(cam_instance) + + return cam_instances + + def merge_trainval_infos(self): + """Merge training and validation infos into a single file.""" + train_infos_path = osp.join( + osp.dirname(self.save_dir), f'{self.info_prefix}_infos_train.pkl') + val_infos_path = osp.join( + osp.dirname(self.save_dir), f'{self.info_prefix}_infos_val.pkl') + train_infos = mmengine.load(train_infos_path) + val_infos = mmengine.load(val_infos_path) + trainval_infos = dict( + metainfo=train_infos['metainfo'], + data_list=train_infos['data_list'] + val_infos['data_list']) + mmengine.dump( + trainval_infos, + osp.join( + osp.dirname(self.save_dir), + f'{self.info_prefix}_infos_trainval.pkl')) + def create_ImageSets_img_ids(root_dir, splits): + """Create txt files indicating what to collect in each split.""" save_dir = join(root_dir, 'ImageSets/') if not exists(save_dir): os.mkdir(save_dir) - idx_all = [[] for i in splits] + idx_all = [[] for _ in splits] for i, split in enumerate(splits): - path = join(root_dir, splits[i], 'calib') + path = join(root_dir, split, 'image_0') if not exists(path): RawNames = [] else: RawNames = os.listdir(path) for name in RawNames: - if name.endswith('.txt'): - idx = name.replace('.txt', '\n') + if name.endswith('.jpg'): + idx = name.replace('.jpg', '\n') idx_all[int(idx[0])].append(idx) idx_all[i].sort() open(save_dir + 'train.txt', 'w').writelines(idx_all[0]) open(save_dir + 'val.txt', 'w').writelines(idx_all[1]) open(save_dir + 'trainval.txt', 'w').writelines(idx_all[0] + idx_all[1]) - open(save_dir + 'test.txt', 'w').writelines(idx_all[2]) - # open(save_dir+'test_cam_only.txt','w').writelines(idx_all[3]) + if len(idx_all) >= 3: + open(save_dir + 'test.txt', 'w').writelines(idx_all[2]) + if len(idx_all) >= 4: + open(save_dir + 'test_cam_only.txt', 'w').writelines(idx_all[3]) print('created txt files indicating what to collect in ', splits) From 762e3b53ca30e4cb0302a40330a609216236e541 Mon Sep 17 00:00:00 2001 From: Sun Jiahao <72679458+sunjiahao1999@users.noreply.github.com> Date: Thu, 28 Dec 2023 21:34:59 +0800 Subject: [PATCH 3/7] [Feature] Support DSVT training (#2738) Co-authored-by: JingweiZhang12 Co-authored-by: sjh --- .../models/dense_heads/centerpoint_head.py | 4 +- 
mmdet3d/models/necks/second_fpn.py | 18 +- mmdet3d/structures/bbox_3d/base_box3d.py | 13 +- projects/DSVT/README.md | 18 +- ...ecfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py | 146 +++++-- projects/DSVT/dsvt/__init__.py | 5 +- projects/DSVT/dsvt/disable_aug_hook.py | 69 ++++ projects/DSVT/dsvt/dsvt.py | 6 +- projects/DSVT/dsvt/dsvt_head.py | 391 +++++++++++++++++- projects/DSVT/dsvt/dynamic_pillar_vfe.py | 2 + projects/DSVT/dsvt/res_second.py | 19 +- projects/DSVT/dsvt/transforms_3d.py | 116 ++++++ projects/DSVT/dsvt/utils.py | 144 ++++++- tools/train.py | 10 + 14 files changed, 875 insertions(+), 86 deletions(-) create mode 100644 projects/DSVT/dsvt/disable_aug_hook.py create mode 100644 projects/DSVT/dsvt/transforms_3d.py diff --git a/mmdet3d/models/dense_heads/centerpoint_head.py b/mmdet3d/models/dense_heads/centerpoint_head.py index 12ba84234e..c3fc187964 100644 --- a/mmdet3d/models/dense_heads/centerpoint_head.py +++ b/mmdet3d/models/dense_heads/centerpoint_head.py @@ -101,7 +101,7 @@ def forward(self, x): Returns: dict[str: torch.Tensor]: contains the following keys: - -reg (torch.Tensor): 2D regression value with the + -reg (torch.Tensor): 2D regression value with the shape of [B, 2, H, W]. -height (torch.Tensor): Height value with the shape of [B, 1, H, W]. @@ -217,7 +217,7 @@ def forward(self, x): Returns: dict[str: torch.Tensor]: contains the following keys: - -reg (torch.Tensor): 2D regression value with the + -reg (torch.Tensor): 2D regression value with the shape of [B, 2, H, W]. -height (torch.Tensor): Height value with the shape of [B, 1, H, W]. diff --git a/mmdet3d/models/necks/second_fpn.py b/mmdet3d/models/necks/second_fpn.py index 90e57ec05c..d4dc590c15 100644 --- a/mmdet3d/models/necks/second_fpn.py +++ b/mmdet3d/models/necks/second_fpn.py @@ -21,6 +21,10 @@ class SECONDFPN(BaseModule): upsample_cfg (dict): Config dict of upsample layers. conv_cfg (dict): Config dict of conv layers. use_conv_for_no_stride (bool): Whether to use conv when stride is 1. + init_cfg (dict or :obj:`ConfigDict` or list[dict or :obj:`ConfigDict`], + optional): Initialization config dict. Defaults to + [dict(type='Kaiming', layer='ConvTranspose2d'), + dict(type='Constant', layer='NaiveSyncBatchNorm2d', val=1.0)]. """ def __init__(self, @@ -31,7 +35,13 @@ def __init__(self, upsample_cfg=dict(type='deconv', bias=False), conv_cfg=dict(type='Conv2d', bias=False), use_conv_for_no_stride=False, - init_cfg=None): + init_cfg=[ + dict(type='Kaiming', layer='ConvTranspose2d'), + dict( + type='Constant', + layer='NaiveSyncBatchNorm2d', + val=1.0) + ]): # if for GroupNorm, # cfg is dict(type='GN', num_groups=num_groups, eps=1e-3, affine=True) super(SECONDFPN, self).__init__(init_cfg=init_cfg) @@ -64,12 +74,6 @@ def __init__(self, deblocks.append(deblock) self.deblocks = nn.ModuleList(deblocks) - if init_cfg is None: - self.init_cfg = [ - dict(type='Kaiming', layer='ConvTranspose2d'), - dict(type='Constant', layer='NaiveSyncBatchNorm2d', val=1.0) - ] - def forward(self, x): """Forward function. diff --git a/mmdet3d/structures/bbox_3d/base_box3d.py b/mmdet3d/structures/bbox_3d/base_box3d.py index 50b092c06e..7fb703c731 100644 --- a/mmdet3d/structures/bbox_3d/base_box3d.py +++ b/mmdet3d/structures/bbox_3d/base_box3d.py @@ -275,12 +275,13 @@ def in_range_3d( Tensor: A binary vector indicating whether each point is inside the reference range. 
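+
+        Note:
+            The check is performed on the boxes' gravity centers, so a box
+            is treated as in range as long as its gravity center lies
+            inside ``box_range``, regardless of the box extent.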
""" - in_range_flags = ((self.tensor[:, 0] > box_range[0]) - & (self.tensor[:, 1] > box_range[1]) - & (self.tensor[:, 2] > box_range[2]) - & (self.tensor[:, 0] < box_range[3]) - & (self.tensor[:, 1] < box_range[4]) - & (self.tensor[:, 2] < box_range[5])) + gravity_center = self.gravity_center + in_range_flags = ((gravity_center[:, 0] > box_range[0]) + & (gravity_center[:, 1] > box_range[1]) + & (gravity_center[:, 2] > box_range[2]) + & (gravity_center[:, 0] < box_range[3]) + & (gravity_center[:, 1] < box_range[4]) + & (gravity_center[:, 2] < box_range[5])) return in_range_flags @abstractmethod diff --git a/projects/DSVT/README.md b/projects/DSVT/README.md index a4b45b570d..d60e49abe5 100644 --- a/projects/DSVT/README.md +++ b/projects/DSVT/README.md @@ -57,17 +57,25 @@ python tools/test.py projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1- ### Training commands -The support of training DSVT is on the way. +In MMDetection3D's root directory, run the following command to test the model: + +```bash +tools/dist_train.sh projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py 8 --sync_bn torch +``` ## Results and models ### Waymo -| Middle Encoder | Backbone | Load Interval | Voxel type (voxel size) | Multi-Class NMS | Multi-frames | Mem (GB) | Inf time (fps) | mAP@L1 | mAPH@L1 | mAP@L2 | **mAPH@L2** | Download | -| :------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------: | :-----------: | :---------------------: | :-------------: | :----------: | :------: | :------------: | :----: | :-----: | :----: | :---------: | :------: | -| [DSVT](./configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py) | [ResSECOND](./configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py) | 5 | voxel (0.32) | ✓ | × | | | 75.2 | 72.2 | 68.9 | 66.1 | | +| Middle Encoder | Backbone | Load Interval | Voxel type (voxel size) | Multi-Class NMS | Multi-frames | mAP@L1 | mAPH@L1 | mAP@L2 | **mAPH@L2** | Download | +| :------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------: | :-----------: | :---------------------: | :-------------: | :----------: | :----: | :-----: | :----: | :---------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| [DSVT](./configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py) | [ResSECOND](./configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py) | 5 | voxel (0.32) | ✓ | × | 75.5 | 72.4 | 69.2 | 66.3 | \[log\](\ dict: """Forward function for CenterPoint. @@ -66,7 +98,298 @@ def loss(self, pts_feats: List[Tensor], Returns: dict: Losses of each branch. 
""" - pass + outs = self(pts_feats) + batch_gt_instance_3d = [] + for data_sample in batch_data_samples: + batch_gt_instance_3d.append(data_sample.gt_instances_3d) + losses = self.loss_by_feat(outs, batch_gt_instance_3d) + return losses + + def _decode_all_preds(self, + pred_dict, + point_cloud_range=None, + voxel_size=None): + batch_size, _, H, W = pred_dict['reg'].shape + + batch_center = pred_dict['reg'].permute(0, 2, 3, 1).contiguous().view( + batch_size, H * W, 2) # (B, H, W, 2) + batch_center_z = pred_dict['height'].permute( + 0, 2, 3, 1).contiguous().view(batch_size, H * W, 1) # (B, H, W, 1) + batch_dim = pred_dict['dim'].exp().permute( + 0, 2, 3, 1).contiguous().view(batch_size, H * W, 3) # (B, H, W, 3) + batch_rot_cos = pred_dict['rot'][:, 0].unsqueeze(dim=1).permute( + 0, 2, 3, 1).contiguous().view(batch_size, H * W, 1) # (B, H, W, 1) + batch_rot_sin = pred_dict['rot'][:, 1].unsqueeze(dim=1).permute( + 0, 2, 3, 1).contiguous().view(batch_size, H * W, 1) # (B, H, W, 1) + batch_vel = pred_dict['vel'].permute(0, 2, 3, 1).contiguous().view( + batch_size, H * W, 2) if 'vel' in pred_dict.keys() else None + + angle = torch.atan2(batch_rot_sin, batch_rot_cos) # (B, H*W, 1) + + ys, xs = torch.meshgrid([ + torch.arange( + 0, H, device=batch_center.device, dtype=batch_center.dtype), + torch.arange( + 0, W, device=batch_center.device, dtype=batch_center.dtype) + ]) + ys = ys.view(1, H, W).repeat(batch_size, 1, 1) + xs = xs.view(1, H, W).repeat(batch_size, 1, 1) + xs = xs.view(batch_size, -1, 1) + batch_center[:, :, 0:1] + ys = ys.view(batch_size, -1, 1) + batch_center[:, :, 1:2] + + xs = xs * voxel_size[0] + point_cloud_range[0] + ys = ys * voxel_size[1] + point_cloud_range[1] + + box_part_list = [xs, ys, batch_center_z, batch_dim, angle] + if batch_vel is not None: + box_part_list.append(batch_vel) + + box_preds = torch.cat((box_part_list), + dim=-1).view(batch_size, H, W, -1) + + return box_preds + + def _transpose_and_gather_feat(self, feat, ind): + feat = feat.permute(0, 2, 3, 1).contiguous() + feat = feat.view(feat.size(0), -1, feat.size(3)) + feat = self._gather_feat(feat, ind) + return feat + + def calc_iou_loss(self, iou_preds, batch_box_preds, mask, ind, gt_boxes): + """ + Args: + iou_preds: (batch x 1 x h x w) + batch_box_preds: (batch x (7 or 9) x h x w) + mask: (batch x max_objects) + ind: (batch x max_objects) + gt_boxes: List of batch groundtruth boxes. + + Returns: + Tensor: IoU Loss. 
+ """ + if mask.sum() == 0: + return iou_preds.new_zeros((1)) + + mask = mask.bool() + selected_iou_preds = self._transpose_and_gather_feat(iou_preds, + ind)[mask] + + selected_box_preds = self._transpose_and_gather_feat( + batch_box_preds, ind)[mask] + gt_boxes = torch.cat(gt_boxes) + assert gt_boxes.size(0) == selected_box_preds.size(0) + iou_target = boxes_iou3d(selected_box_preds[:, 0:7], gt_boxes[:, 0:7]) + iou_target = torch.diag(iou_target).view(-1) + iou_target = iou_target * 2 - 1 # [0, 1] ==> [-1, 1] + + loss = self.loss_iou(selected_iou_preds.view(-1), iou_target) + loss = loss / torch.clamp(mask.sum(), min=1e-4) + return loss + + def calc_iou_reg_loss(self, batch_box_preds, mask, ind, gt_boxes): + if mask.sum() == 0: + return batch_box_preds.new_zeros((1)) + + mask = mask.bool() + + selected_box_preds = self._transpose_and_gather_feat( + batch_box_preds, ind)[mask] + gt_boxes = torch.cat(gt_boxes) + assert gt_boxes.size(0) == selected_box_preds.size(0) + loss = self.loss_iou_reg(selected_box_preds[:, 0:7], gt_boxes[:, 0:7]) + + return loss + + def get_targets( + self, + batch_gt_instances_3d: List[InstanceData], + ) -> Tuple[List[Tensor]]: + """Generate targets. + + How each output is transformed: + + Each nested list is transposed so that all same-index elements in + each sub-list (1, ..., N) become the new sub-lists. + [ [a0, a1, a2, ... ], [b0, b1, b2, ... ], ... ] + ==> [ [a0, b0, ... ], [a1, b1, ... ], [a2, b2, ... ] ] + + The new transposed nested list is converted into a list of N + tensors generated by concatenating tensors in the new sub-lists. + [ tensor0, tensor1, tensor2, ... ] + + Args: + batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of + gt_instances. It usually includes ``bboxes_3d`` and\ + ``labels_3d`` attributes. + + Returns: + Returns: + tuple[list[torch.Tensor]]: Tuple of target including + the following results in order. + + - list[torch.Tensor]: Heatmap scores. + - list[torch.Tensor]: Ground truth boxes. + - list[torch.Tensor]: Indexes indicating the + position of the valid boxes. + - list[torch.Tensor]: Masks indicating which + boxes are valid. + """ + heatmaps, anno_boxes, inds, masks, task_gt_bboxes = multi_apply( + self.get_targets_single, batch_gt_instances_3d) + # Transpose heatmaps + heatmaps = list(map(list, zip(*heatmaps))) + heatmaps = [torch.stack(hms_) for hms_ in heatmaps] + # Transpose anno_boxes + anno_boxes = list(map(list, zip(*anno_boxes))) + anno_boxes = [torch.stack(anno_boxes_) for anno_boxes_ in anno_boxes] + # Transpose inds + inds = list(map(list, zip(*inds))) + inds = [torch.stack(inds_) for inds_ in inds] + # Transpose masks + masks = list(map(list, zip(*masks))) + masks = [torch.stack(masks_) for masks_ in masks] + # Transpose task_gt_bboxes + task_gt_bboxes = list(map(list, zip(*task_gt_bboxes))) + return heatmaps, anno_boxes, inds, masks, task_gt_bboxes + + def get_targets_single(self, + gt_instances_3d: InstanceData) -> Tuple[Tensor]: + """Generate training targets for a single sample. + + Args: + gt_instances_3d (:obj:`InstanceData`): Gt_instances_3d of + single data sample. It usually includes + ``bboxes_3d`` and ``labels_3d`` attributes. + + Returns: + tuple[list[torch.Tensor]]: Tuple of target including + the following results in order. + + - list[torch.Tensor]: Heatmap scores. + - list[torch.Tensor]: Ground truth boxes. + - list[torch.Tensor]: Indexes indicating the position + of the valid boxes. + - list[torch.Tensor]: Masks indicating which boxes + are valid. 
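+                - list[torch.Tensor]: Ground truth boxes of each task,
+                    which are later used for the IoU-based losses.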
+ """ + gt_labels_3d = gt_instances_3d.labels_3d + gt_bboxes_3d = gt_instances_3d.bboxes_3d + device = gt_labels_3d.device + gt_bboxes_3d = torch.cat( + (gt_bboxes_3d.gravity_center, gt_bboxes_3d.tensor[:, 3:]), + dim=1).to(device) + max_objs = self.train_cfg['max_objs'] * self.train_cfg['dense_reg'] + grid_size = torch.tensor(self.train_cfg['grid_size']).to(device) + pc_range = torch.tensor(self.train_cfg['point_cloud_range']) + voxel_size = torch.tensor(self.train_cfg['voxel_size']) + + feature_map_size = grid_size[:2] // self.train_cfg['out_size_factor'] + + # reorganize the gt_dict by tasks + task_masks = [] + flag = 0 + for class_name in self.class_names: + task_masks.append([ + torch.where(gt_labels_3d == class_name.index(i) + flag) + for i in class_name + ]) + flag += len(class_name) + + task_boxes = [] + task_classes = [] + flag2 = 0 + for idx, mask in enumerate(task_masks): + task_box = [] + task_class = [] + for m in mask: + task_box.append(gt_bboxes_3d[m]) + # 0 is background for each task, so we need to add 1 here. + task_class.append(gt_labels_3d[m] + 1 - flag2) + task_boxes.append(torch.cat(task_box, axis=0).to(device)) + task_classes.append(torch.cat(task_class).long().to(device)) + flag2 += len(mask) + draw_gaussian = draw_heatmap_gaussian + heatmaps, anno_boxes, inds, masks = [], [], [], [] + + for idx, task_head in enumerate(self.task_heads): + heatmap = gt_bboxes_3d.new_zeros( + (len(self.class_names[idx]), feature_map_size[1], + feature_map_size[0])) + + anno_box = gt_bboxes_3d.new_zeros((max_objs, 8), + dtype=torch.float32) + + ind = gt_labels_3d.new_zeros((max_objs), dtype=torch.int64) + mask = gt_bboxes_3d.new_zeros((max_objs), dtype=torch.uint8) + + num_objs = min(task_boxes[idx].shape[0], max_objs) + + for k in range(num_objs): + cls_id = task_classes[idx][k] - 1 + + length = task_boxes[idx][k][3] + width = task_boxes[idx][k][4] + length = length / voxel_size[0] / self.train_cfg[ + 'out_size_factor'] + width = width / voxel_size[1] / self.train_cfg[ + 'out_size_factor'] + + if width > 0 and length > 0: + radius = gaussian_radius( + (width, length), + min_overlap=self.train_cfg['gaussian_overlap']) + radius = max(self.train_cfg['min_radius'], int(radius)) + + # be really careful for the coordinate system of + # your box annotation. 
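+                    # The target center (x, y) is given in the same metric
+                    # coordinate system as `point_cloud_range`; it is mapped
+                    # onto the BEV feature map by subtracting the range
+                    # origin and dividing by voxel_size * out_size_factor.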
+ x, y, z = task_boxes[idx][k][0], task_boxes[idx][k][ + 1], task_boxes[idx][k][2] + + coor_x = ( + x - pc_range[0] + ) / voxel_size[0] / self.train_cfg['out_size_factor'] + coor_y = ( + y - pc_range[1] + ) / voxel_size[1] / self.train_cfg['out_size_factor'] + + center = torch.tensor([coor_x, coor_y], + dtype=torch.float32, + device=device) + center_int = center.to(torch.int32) + + # throw out not in range objects to avoid out of array + # area when creating the heatmap + if not (0 <= center_int[0] < feature_map_size[0] + and 0 <= center_int[1] < feature_map_size[1]): + continue + + draw_gaussian(heatmap[cls_id], center_int, radius) + + new_idx = k + x, y = center_int[0], center_int[1] + + assert (y * feature_map_size[0] + x < + feature_map_size[0] * feature_map_size[1]) + + ind[new_idx] = y * feature_map_size[0] + x + mask[new_idx] = 1 + # TODO: support other outdoor dataset + rot = task_boxes[idx][k][6] + box_dim = task_boxes[idx][k][3:6] + if self.norm_bbox: + box_dim = box_dim.log() + anno_box[new_idx] = torch.cat([ + center - torch.tensor([x, y], device=device), + z.unsqueeze(0), box_dim, + torch.cos(rot).unsqueeze(0), + torch.sin(rot).unsqueeze(0) + ]) + + heatmaps.append(heatmap) + anno_boxes.append(anno_box) + masks.append(mask) + inds.append(ind) + return heatmaps, anno_boxes, inds, masks, task_boxes def loss_by_feat(self, preds_dicts: Tuple[List[dict]], batch_gt_instances_3d: List[InstanceData], *args, @@ -79,13 +402,72 @@ def loss_by_feat(self, preds_dicts: Tuple[List[dict]], tasks head, and the internal list indicate different FPN level. batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of - gt_instances. It usually includes ``bboxes_3d`` and\ + gt_instances_3d. It usually includes ``bboxes_3d`` and ``labels_3d`` attributes. Returns: dict[str,torch.Tensor]: Loss of heatmap and bbox of each task. 
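+            When the head also predicts an ``iou`` map, the returned dict
+            additionally contains a per-task ``loss_iou`` entry (and a
+            ``loss_reg_iou`` entry if ``loss_iou_reg`` is configured).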
""" - pass + heatmaps, anno_boxes, inds, masks, task_gt_bboxes = self.get_targets( + batch_gt_instances_3d) + loss_dict = dict() + for task_id, preds_dict in enumerate(preds_dicts): + # heatmap focal loss + preds_dict[0]['heatmap'] = clip_sigmoid(preds_dict[0]['heatmap']) + num_pos = heatmaps[task_id].eq(1).float().sum().item() + loss_heatmap = self.loss_cls( + preds_dict[0]['heatmap'], + heatmaps[task_id], + avg_factor=max(num_pos, 1)) + target_box = anno_boxes[task_id] + # reconstruct the anno_box from multiple reg heads + preds_dict[0]['anno_box'] = torch.cat( + (preds_dict[0]['reg'], preds_dict[0]['height'], + preds_dict[0]['dim'], preds_dict[0]['rot']), + dim=1) + + # Regression loss for dimension, offset, height, rotation + ind = inds[task_id] + num = masks[task_id].float().sum() + pred = preds_dict[0]['anno_box'].permute(0, 2, 3, 1).contiguous() + pred = pred.view(pred.size(0), -1, pred.size(3)) + pred = self._gather_feat(pred, ind) + mask = masks[task_id].unsqueeze(2).expand_as(target_box).float() + isnotnan = (~torch.isnan(target_box)).float() + mask *= isnotnan + + code_weights = self.train_cfg.get('code_weights', None) + bbox_weights = mask * mask.new_tensor(code_weights) + loss_bbox = self.loss_bbox( + pred, target_box, bbox_weights, avg_factor=(num + 1e-4)) + loss_dict[f'task{task_id}.loss_heatmap'] = loss_heatmap + loss_dict[f'task{task_id}.loss_bbox'] = loss_bbox + + if 'iou' in preds_dict[0]: + batch_box_preds = self._decode_all_preds( + pred_dict=preds_dict[0], + point_cloud_range=self.train_cfg['point_cloud_range'], + voxel_size=self.train_cfg['voxel_size'] + ) # (B, H, W, 7 or 9) + + batch_box_preds_for_iou = batch_box_preds.permute( + 0, 3, 1, 2) # (B, 7 or 9, H, W) + loss_dict[f'task{task_id}.loss_iou'] = self.calc_iou_loss( + iou_preds=preds_dict[0]['iou'], + batch_box_preds=batch_box_preds_for_iou.clone().detach(), + mask=masks[task_id], + ind=ind, + gt_boxes=task_gt_bboxes[task_id]) + + if self.loss_iou_reg is not None: + loss_dict[f'task{task_id}.loss_reg_iou'] = \ + self.calc_iou_reg_loss( + batch_box_preds=batch_box_preds_for_iou, + mask=masks[task_id], + ind=ind, + gt_boxes=task_gt_bboxes[task_id]) + + return loss_dict def predict(self, pts_feats: Tuple[torch.Tensor], @@ -158,6 +540,7 @@ def predict_by_feat(self, preds_dicts: Tuple[List[dict]], else: batch_dim = preds_dict[0]['dim'] + # It's different from CenterHead batch_rotc = preds_dict[0]['rot'][:, 0].unsqueeze(1) batch_rots = preds_dict[0]['rot'][:, 1].unsqueeze(1) batch_iou = (preds_dict[0]['iou'] + diff --git a/projects/DSVT/dsvt/dynamic_pillar_vfe.py b/projects/DSVT/dsvt/dynamic_pillar_vfe.py index 97c75aaf00..3fc5266c56 100644 --- a/projects/DSVT/dsvt/dynamic_pillar_vfe.py +++ b/projects/DSVT/dsvt/dynamic_pillar_vfe.py @@ -1,4 +1,5 @@ # modified from https://github.com/Haiyang-W/DSVT +import numpy as np import torch import torch.nn as nn import torch_scatter @@ -76,6 +77,7 @@ def __init__(self, with_distance, use_absolute_xyz, use_norm, num_filters, self.voxel_x = voxel_size[0] self.voxel_y = voxel_size[1] self.voxel_z = voxel_size[2] + point_cloud_range = np.array(point_cloud_range).astype(np.float32) self.x_offset = self.voxel_x / 2 + point_cloud_range[0] self.y_offset = self.voxel_y / 2 + point_cloud_range[1] self.z_offset = self.voxel_z / 2 + point_cloud_range[2] diff --git a/projects/DSVT/dsvt/res_second.py b/projects/DSVT/dsvt/res_second.py index e1ddc1be6c..f8775e34e8 100644 --- a/projects/DSVT/dsvt/res_second.py +++ b/projects/DSVT/dsvt/res_second.py @@ -1,7 +1,5 @@ # modified from 
https://github.com/Haiyang-W/DSVT - -import warnings -from typing import Optional, Sequence, Tuple +from typing import Sequence, Tuple from mmengine.model import BaseModule from torch import Tensor @@ -78,8 +76,8 @@ class ResSECOND(BaseModule): out_channels (list[int]): Output channels for multi-scale feature maps. blocks_nums (list[int]): Number of blocks in each stage. layer_strides (list[int]): Strides of each stage. - norm_cfg (dict): Config dict of normalization layers. - conv_cfg (dict): Config dict of convolutional layers. + init_cfg (dict, optional): Config for weight initialization. + Defaults to None. """ def __init__(self, @@ -87,8 +85,7 @@ def __init__(self, out_channels: Sequence[int] = [128, 128, 256], blocks_nums: Sequence[int] = [1, 2, 2], layer_strides: Sequence[int] = [2, 2, 2], - init_cfg: OptMultiConfig = None, - pretrained: Optional[str] = None) -> None: + init_cfg: OptMultiConfig = None) -> None: super(ResSECOND, self).__init__(init_cfg=init_cfg) assert len(layer_strides) == len(blocks_nums) assert len(out_channels) == len(blocks_nums) @@ -108,14 +105,6 @@ def __init__(self, BasicResBlock(out_channels[i], out_channels[i])) blocks.append(nn.Sequential(*cur_layers)) self.blocks = nn.Sequential(*blocks) - assert not (init_cfg and pretrained), \ - 'init_cfg and pretrained cannot be setting at the same time' - if isinstance(pretrained, str): - warnings.warn('DeprecationWarning: pretrained is a deprecated, ' - 'please use "init_cfg" instead') - self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) - else: - self.init_cfg = dict(type='Kaiming', layer='Conv2d') def forward(self, x: Tensor) -> Tuple[Tensor, ...]: """Forward function. diff --git a/projects/DSVT/dsvt/transforms_3d.py b/projects/DSVT/dsvt/transforms_3d.py new file mode 100644 index 0000000000..ff0c9a2314 --- /dev/null +++ b/projects/DSVT/dsvt/transforms_3d.py @@ -0,0 +1,116 @@ +from typing import List + +import numpy as np +from mmcv import BaseTransform + +from mmdet3d.registry import TRANSFORMS + + +@TRANSFORMS.register_module() +class ObjectRangeFilter3D(BaseTransform): + """Filter objects by the range. It differs from `ObjectRangeFilter` by + using `in_range_3d` instead of `in_range_bev`. + + Required Keys: + + - gt_bboxes_3d + + Modified Keys: + + - gt_bboxes_3d + + Args: + point_cloud_range (list[float]): Point cloud range. + """ + + def __init__(self, point_cloud_range: List[float]) -> None: + self.pcd_range = np.array(point_cloud_range, dtype=np.float32) + + def transform(self, input_dict: dict) -> dict: + """Transform function to filter objects by the range. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after filtering, 'gt_bboxes_3d', 'gt_labels_3d' + keys are updated in the result dict. 
+ """ + gt_bboxes_3d = input_dict['gt_bboxes_3d'] + gt_labels_3d = input_dict['gt_labels_3d'] + mask = gt_bboxes_3d.in_range_3d(self.pcd_range) + gt_bboxes_3d = gt_bboxes_3d[mask] + # mask is a torch tensor but gt_labels_3d is still numpy array + # using mask to index gt_labels_3d will cause bug when + # len(gt_labels_3d) == 1, where mask=1 will be interpreted + # as gt_labels_3d[1] and cause out of index error + gt_labels_3d = gt_labels_3d[mask.numpy().astype(bool)] + + # limit rad to [-pi, pi] + gt_bboxes_3d.limit_yaw(offset=0.5, period=2 * np.pi) + input_dict['gt_bboxes_3d'] = gt_bboxes_3d + input_dict['gt_labels_3d'] = gt_labels_3d + + return input_dict + + def __repr__(self) -> str: + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(point_cloud_range={self.pcd_range.tolist()})' + return repr_str + + +@TRANSFORMS.register_module() +class PointsRangeFilter3D(BaseTransform): + """Filter points by the range. It differs from `PointRangeFilter` by using + `in_range_bev` instead of `in_range_3d`. + + Required Keys: + + - points + - pts_instance_mask (optional) + + Modified Keys: + + - points + - pts_instance_mask (optional) + + Args: + point_cloud_range (list[float]): Point cloud range. + """ + + def __init__(self, point_cloud_range: List[float]) -> None: + self.pcd_range = np.array(point_cloud_range, dtype=np.float32) + + def transform(self, input_dict: dict) -> dict: + """Transform function to filter points by the range. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after filtering, 'points', 'pts_instance_mask' + and 'pts_semantic_mask' keys are updated in the result dict. + """ + points = input_dict['points'] + points_mask = points.in_range_bev(self.pcd_range[[0, 1, 3, 4]]) + clean_points = points[points_mask] + input_dict['points'] = clean_points + points_mask = points_mask.numpy() + + pts_instance_mask = input_dict.get('pts_instance_mask', None) + pts_semantic_mask = input_dict.get('pts_semantic_mask', None) + + if pts_instance_mask is not None: + input_dict['pts_instance_mask'] = pts_instance_mask[points_mask] + + if pts_semantic_mask is not None: + input_dict['pts_semantic_mask'] = pts_semantic_mask[points_mask] + + return input_dict + + def __repr__(self) -> str: + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(point_cloud_range={self.pcd_range.tolist()})' + return repr_str diff --git a/projects/DSVT/dsvt/utils.py b/projects/DSVT/dsvt/utils.py index 7c40383ce7..706ee04280 100644 --- a/projects/DSVT/dsvt/utils.py +++ b/projects/DSVT/dsvt/utils.py @@ -3,10 +3,11 @@ import numpy as np import torch import torch.nn as nn +from mmdet.models.losses.utils import weighted_loss from torch import Tensor from mmdet3d.models.task_modules import CenterPointBBoxCoder -from mmdet3d.registry import TASK_UTILS +from mmdet3d.registry import MODELS, TASK_UTILS from .ops.ingroup_inds.ingroup_inds_op import ingroup_inds get_inner_win_inds_cuda = ingroup_inds @@ -266,7 +267,7 @@ def decode(self, thresh_mask = final_scores > self.score_threshold if self.post_center_range is not None: - self.post_center_range = torch.tensor( + self.post_center_range = torch.as_tensor( self.post_center_range, device=heat.device) mask = (final_box_preds[..., :3] >= self.post_center_range[:3]).all(2) @@ -298,3 +299,142 @@ def decode(self, 'support post_center_range is not None for now!') return predictions_dicts + + +def center_to_corner2d(center, dim): + 
corners_norm = torch.tensor( + [[-0.5, -0.5], [-0.5, 0.5], [0.5, 0.5], [0.5, -0.5]], + device=dim.device).type_as(center) # (4, 2) + corners = dim.view([-1, 1, 2]) * corners_norm.view([1, 4, 2]) # (N, 4, 2) + corners = corners + center.view(-1, 1, 2) + return corners + + +@weighted_loss +def diou3d_loss(pred_boxes, gt_boxes, eps: float = 1e-7): + """ + modified from https://github.com/agent-sgs/PillarNet/blob/master/det3d/core/utils/center_utils.py # noqa + Args: + pred_boxes (N, 7): + gt_boxes (N, 7): + + Returns: + Tensor: Distance-IoU Loss. + """ + assert pred_boxes.shape[0] == gt_boxes.shape[0] + + qcorners = center_to_corner2d(pred_boxes[:, :2], + pred_boxes[:, 3:5]) # (N, 4, 2) + gcorners = center_to_corner2d(gt_boxes[:, :2], gt_boxes[:, + 3:5]) # (N, 4, 2) + + inter_max_xy = torch.minimum(qcorners[:, 2], gcorners[:, 2]) + inter_min_xy = torch.maximum(qcorners[:, 0], gcorners[:, 0]) + out_max_xy = torch.maximum(qcorners[:, 2], gcorners[:, 2]) + out_min_xy = torch.minimum(qcorners[:, 0], gcorners[:, 0]) + + # calculate area + volume_pred_boxes = pred_boxes[:, 3] * pred_boxes[:, 4] * pred_boxes[:, 5] + volume_gt_boxes = gt_boxes[:, 3] * gt_boxes[:, 4] * gt_boxes[:, 5] + + inter_h = torch.minimum( + pred_boxes[:, 2] + 0.5 * pred_boxes[:, 5], + gt_boxes[:, 2] + 0.5 * gt_boxes[:, 5]) - torch.maximum( + pred_boxes[:, 2] - 0.5 * pred_boxes[:, 5], + gt_boxes[:, 2] - 0.5 * gt_boxes[:, 5]) + inter_h = torch.clamp(inter_h, min=0) + + inter = torch.clamp((inter_max_xy - inter_min_xy), min=0) + volume_inter = inter[:, 0] * inter[:, 1] * inter_h + volume_union = volume_gt_boxes + volume_pred_boxes - volume_inter + eps + + # boxes_iou3d_gpu(pred_boxes, gt_boxes) + inter_diag = torch.pow(gt_boxes[:, 0:3] - pred_boxes[:, 0:3], 2).sum(-1) + + outer_h = torch.maximum( + gt_boxes[:, 2] + 0.5 * gt_boxes[:, 5], + pred_boxes[:, 2] + 0.5 * pred_boxes[:, 5]) - torch.minimum( + gt_boxes[:, 2] - 0.5 * gt_boxes[:, 5], + pred_boxes[:, 2] - 0.5 * pred_boxes[:, 5]) + outer_h = torch.clamp(outer_h, min=0) + outer = torch.clamp((out_max_xy - out_min_xy), min=0) + outer_diag = outer[:, 0]**2 + outer[:, 1]**2 + outer_h**2 + eps + + dious = volume_inter / volume_union - inter_diag / outer_diag + dious = torch.clamp(dious, min=-1.0, max=1.0) + + loss = 1 - dious + + return loss + + +@MODELS.register_module() +class DIoU3DLoss(nn.Module): + r"""3D bboxes Implementation of `Distance-IoU Loss: Faster and Better + Learning for Bounding Box Regression `_. + + Code is modified from https://github.com/Zzh-tju/DIoU. + + Args: + eps (float): Epsilon to avoid log(0). Defaults to 1e-6. + reduction (str): Options are "none", "mean" and "sum". + Defaults to "mean". + loss_weight (float): Weight of loss. Defaults to 1.0. + """ + + def __init__(self, + eps: float = 1e-6, + reduction: str = 'mean', + loss_weight: float = 1.0) -> None: + super().__init__() + self.eps = eps + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred: Tensor, + target: Tensor, + weight: Optional[Tensor] = None, + avg_factor: Optional[int] = None, + reduction_override: Optional[str] = None, + **kwargs) -> Tensor: + """Forward function. + + Args: + pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2), + shape (n, 4). + target (Tensor): The learning target of the prediction, + shape (n, 4). + weight (Optional[Tensor], optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (Optional[int], optional): Average factor that is used + to average the loss. Defaults to None. 
+ reduction_override (Optional[str], optional): The reduction method + used to override the original reduction method of the loss. + Defaults to None. Options are "none", "mean" and "sum". + + Returns: + Tensor: Loss tensor. + """ + if weight is not None and not torch.any(weight > 0): + if pred.dim() == weight.dim() + 1: + weight = weight.unsqueeze(1) + return (pred * weight).sum() # 0 + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + if weight is not None and weight.dim() > 1: + # TODO: remove this in the future + # reduce the weight of shape (n, 4) to (n,) to match the + # giou_loss of shape (n,) + assert weight.shape == pred.shape + weight = weight.mean(-1) + loss = self.loss_weight * diou3d_loss( + pred, + target, + weight, + eps=self.eps, + reduction=reduction, + avg_factor=avg_factor, + **kwargs) + return loss diff --git a/tools/train.py b/tools/train.py index b2ced54b05..6b9c3b0842 100644 --- a/tools/train.py +++ b/tools/train.py @@ -21,6 +21,12 @@ def parse_args(): action='store_true', default=False, help='enable automatic-mixed-precision training') + parser.add_argument( + '--sync_bn', + choices=['none', 'torch', 'mmcv'], + default='none', + help='convert all BatchNorm layers in the model to SyncBatchNorm ' + '(SyncBN) or mmcv.ops.sync_bn.SyncBatchNorm (MMSyncBN) layers.') parser.add_argument( '--auto-scale-lr', action='store_true', @@ -98,6 +104,10 @@ def main(): cfg.optim_wrapper.type = 'AmpOptimWrapper' cfg.optim_wrapper.loss_scale = 'dynamic' + # convert BatchNorm layers + if args.sync_bn != 'none': + cfg.sync_bn = args.sync_bn + # enable automatically scaling LR if args.auto_scale_lr: if 'auto_scale_lr' in cfg and \ From 8deeb6e203f80329f33759c9e5a2ef6cb07902d3 Mon Sep 17 00:00:00 2001 From: Sun Jiahao <72679458+sunjiahao1999@users.noreply.github.com> Date: Thu, 4 Jan 2024 12:08:14 +0800 Subject: [PATCH 4/7] [Fix] fix MinkUNet & SPVCNN configs (#2854) --- ...minkunet18_w16_torchsparse_8xb2-amp-15e_semantickitti.py | 2 +- ...minkunet18_w20_torchsparse_8xb2-amp-15e_semantickitti.py | 2 +- configs/spvcnn/spvcnn_w16_8xb2-amp-15e_semantickitti.py | 2 +- configs/spvcnn/spvcnn_w20_8xb2-amp-15e_semantickitti.py | 2 +- projects/DSVT/README.md | 6 +++--- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/configs/minkunet/minkunet18_w16_torchsparse_8xb2-amp-15e_semantickitti.py b/configs/minkunet/minkunet18_w16_torchsparse_8xb2-amp-15e_semantickitti.py index ac450bf03a..94fe3bf9a4 100644 --- a/configs/minkunet/minkunet18_w16_torchsparse_8xb2-amp-15e_semantickitti.py +++ b/configs/minkunet/minkunet18_w16_torchsparse_8xb2-amp-15e_semantickitti.py @@ -1,4 +1,4 @@ -_base_ = ['./minkunet_w32_8xb2-15e_semantickitti.py'] +_base_ = ['./minkunet18_w32_torchsparse_8xb2-amp-15e_semantickitti.py'] model = dict( backbone=dict( diff --git a/configs/minkunet/minkunet18_w20_torchsparse_8xb2-amp-15e_semantickitti.py b/configs/minkunet/minkunet18_w20_torchsparse_8xb2-amp-15e_semantickitti.py index 34c501f52a..cb2b3b3d37 100644 --- a/configs/minkunet/minkunet18_w20_torchsparse_8xb2-amp-15e_semantickitti.py +++ b/configs/minkunet/minkunet18_w20_torchsparse_8xb2-amp-15e_semantickitti.py @@ -1,4 +1,4 @@ -_base_ = ['./minkunet_w32_8xb2-15e_semantickitti.py'] +_base_ = ['./minkunet18_w32_torchsparse_8xb2-amp-15e_semantickitti.py'] model = dict( backbone=dict( diff --git a/configs/spvcnn/spvcnn_w16_8xb2-amp-15e_semantickitti.py b/configs/spvcnn/spvcnn_w16_8xb2-amp-15e_semantickitti.py index 
2bfcd2dac3..51a14cb574 100644 --- a/configs/spvcnn/spvcnn_w16_8xb2-amp-15e_semantickitti.py +++ b/configs/spvcnn/spvcnn_w16_8xb2-amp-15e_semantickitti.py @@ -1,4 +1,4 @@ -_base_ = ['./spvcnn_w32_8xb2-15e_semantickitti.py'] +_base_ = ['./spvcnn_w32_8xb2-amp-15e_semantickitti.py'] model = dict( backbone=dict( diff --git a/configs/spvcnn/spvcnn_w20_8xb2-amp-15e_semantickitti.py b/configs/spvcnn/spvcnn_w20_8xb2-amp-15e_semantickitti.py index cdafb16e1e..75ecc1477d 100644 --- a/configs/spvcnn/spvcnn_w20_8xb2-amp-15e_semantickitti.py +++ b/configs/spvcnn/spvcnn_w20_8xb2-amp-15e_semantickitti.py @@ -1,4 +1,4 @@ -_base_ = ['./spvcnn_w32_8xb2-15e_semantickitti.py'] +_base_ = ['./spvcnn_w32_8xb2-amp-15e_semantickitti.py'] model = dict( backbone=dict( diff --git a/projects/DSVT/README.md b/projects/DSVT/README.md index d60e49abe5..e8c1262ba4 100644 --- a/projects/DSVT/README.md +++ b/projects/DSVT/README.md @@ -67,9 +67,9 @@ tools/dist_train.sh projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1-c ### Waymo -| Middle Encoder | Backbone | Load Interval | Voxel type (voxel size) | Multi-Class NMS | Multi-frames | mAP@L1 | mAPH@L1 | mAP@L2 | **mAPH@L2** | Download | -| :------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------: | :-----------: | :---------------------: | :-------------: | :----------: | :----: | :-----: | :----: | :---------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| [DSVT](./configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py) | [ResSECOND](./configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py) | 5 | voxel (0.32) | ✓ | × | 75.5 | 72.4 | 69.2 | 66.3 | \[log\](\ Date: Thu, 4 Jan 2024 18:54:39 +0800 Subject: [PATCH 5/7] [Feature]Support NeRF-Det (#2732) --- projects/NeRF-Det/README.md | 115 ++++ .../nerfdet_res101_2x_low_res_depth.py | 198 ++++++ .../configs/nerfdet_res50_2x_low_res.py | 104 +++ .../configs/nerfdet_res50_2x_low_res_depth.py | 198 ++++++ projects/NeRF-Det/nerfdet/__init__.py | 11 + .../NeRF-Det/nerfdet/data_preprocessor.py | 583 ++++++++++++++++ projects/NeRF-Det/nerfdet/formating.py | 350 ++++++++++ .../NeRF-Det/nerfdet/multiview_pipeline.py | 297 ++++++++ .../nerfdet/nerf_det3d_data_sample.py | 52 ++ .../NeRF-Det/nerfdet/nerf_utils/nerf_mlp.py | 277 ++++++++ .../NeRF-Det/nerfdet/nerf_utils/projection.py | 140 ++++ .../NeRF-Det/nerfdet/nerf_utils/render_ray.py | 431 ++++++++++++ .../nerfdet/nerf_utils/save_rendered_img.py | 79 +++ projects/NeRF-Det/nerfdet/nerfdet.py | 632 ++++++++++++++++++ projects/NeRF-Det/nerfdet/nerfdet_head.py | 629 +++++++++++++++++ .../nerfdet/scannet_multiview_dataset.py | 202 ++++++ projects/NeRF-Det/prepare_infos.py | 151 +++++ 17 files changed, 4449 insertions(+) create mode 100644 projects/NeRF-Det/README.md create mode 100644 projects/NeRF-Det/configs/nerfdet_res101_2x_low_res_depth.py create mode 100644 projects/NeRF-Det/configs/nerfdet_res50_2x_low_res.py create mode 100644 projects/NeRF-Det/configs/nerfdet_res50_2x_low_res_depth.py create mode 100644 projects/NeRF-Det/nerfdet/__init__.py create mode 100644 projects/NeRF-Det/nerfdet/data_preprocessor.py create mode 100644 projects/NeRF-Det/nerfdet/formating.py create mode 100644 projects/NeRF-Det/nerfdet/multiview_pipeline.py create mode 100644 
projects/NeRF-Det/nerfdet/nerf_det3d_data_sample.py
 create mode 100644 projects/NeRF-Det/nerfdet/nerf_utils/nerf_mlp.py
 create mode 100644 projects/NeRF-Det/nerfdet/nerf_utils/projection.py
 create mode 100644 projects/NeRF-Det/nerfdet/nerf_utils/render_ray.py
 create mode 100644 projects/NeRF-Det/nerfdet/nerf_utils/save_rendered_img.py
 create mode 100644 projects/NeRF-Det/nerfdet/nerfdet.py
 create mode 100644 projects/NeRF-Det/nerfdet/nerfdet_head.py
 create mode 100644 projects/NeRF-Det/nerfdet/scannet_multiview_dataset.py
 create mode 100644 projects/NeRF-Det/prepare_infos.py

diff --git a/projects/NeRF-Det/README.md b/projects/NeRF-Det/README.md
new file mode 100644
index 0000000000..93119895e9
--- /dev/null
+++ b/projects/NeRF-Det/README.md
@@ -0,0 +1,115 @@
+# NeRF-Det: Learning Geometry-Aware Volumetric Representation for Multi-View 3D Object Detection
+
+> [NeRF-Det: Learning Geometry-Aware Volumetric Representation for Multi-View 3D Object Detection](https://arxiv.org/abs/2307.14620)
+
+
+
+## Abstract
+
+NeRF-Det is a novel method for indoor 3D detection with posed RGB images as input. Unlike existing indoor 3D detection methods that struggle to model scene geometry, NeRF-Det makes novel use of NeRF in an end-to-end manner to explicitly estimate 3D geometry, thereby improving 3D detection performance. Specifically, to avoid the significant extra latency associated with per-scene optimization of NeRF, NeRF-Det introduces sufficient geometry priors to enhance the generalizability of NeRF-MLP. Furthermore, it subtly connects the detection and NeRF branches through a shared MLP, enabling an efficient adaptation of NeRF to detection and yielding geometry-aware volumetric representations for 3D detection. NeRF-Det outperforms state-of-the-art methods by 3.9 mAP and 3.1 mAP on the ScanNet and ARKITScenes benchmarks, respectively. The authors provide extensive analysis to shed light on how NeRF-Det works. As a result of the joint-training design, NeRF-Det is able to generalize well to unseen scenes for object detection, view synthesis, and depth estimation tasks without requiring per-scene optimization. Code will be available at https://github.com/facebookresearch/NeRF-Det
+
+ +
+
+## Introduction
+
+This directory contains the implementation of NeRF-Det (https://arxiv.org/abs/2307.14620). Our implementation is built on top of MMDetection3D and has been updated to be compatible with the latest mmdet3d version; the codebase and config files have been adapted accordingly. All previously released pretrained models are verified against the results listed below. However, newly trained models are yet to be uploaded.
+
+
+
+## Dataset
+
+The ScanNet dataset format in the latest version of mmdet3d only supports LiDAR-based tasks. For NeRF-Det, we need to create a new format of the ScanNet dataset.
+
+Please follow the ScanNet data preparation instructions in mmdet3d to prepare the raw data. After that, use the following command to generate the pkl files used by NeRF-Det.
+
+```bash
+python projects/NeRF-Det/prepare_infos.py --root-path ./data/scannet --out-dir ./data/scannet
+```
+
+The new format of the pkl is organized as below:
+
+- scannet_infos_train.pkl: The training data infos. The detailed info of each scan is as follows:
+  - info\['instances'\]: A list of dicts containing all annotations; each dict contains all annotation information of a single instance. For the i-th instance:
+    - info\['instances'\]\[i\]\['bbox_3d'\]: List of 6 numbers representing the axis-aligned 3D bounding box in the depth coordinate system, in (x, y, z, l, w, h) order.
+    - info\['instances'\]\[i\]\['bbox_label_3d'\]: The label of each 3D bounding box.
+  - info\['cam2img'\]: The intrinsic matrix. Every scene has one matrix.
+  - info\['lidar2cam'\]: The extrinsic matrices. Every scene has 300 matrices.
+  - info\['img_paths'\]: The paths of the 300 RGB images.
+  - info\['axis_align_matrix'\]: The axis-align matrix. Every scene has one matrix.
+
+After preparing your ScanNet dataset pkls, please change the paths in the configs to fit your project.
+
+## Train
+
+In MMDet3D's root directory, run the following command to train the model:
+
+```bash
+python tools/train.py projects/NeRF-Det/configs/nerfdet_res50_2x_low_res.py ${WORK_DIR}
+```
+
+## Results and Models
+
+### NeRF-Det
+
+| Backbone                                                        | mAP@25 | mAP@50 | Log       |
+| :-------------------------------------------------------------: | :----: | :----: | :-------: |
+| [NeRF-Det-R50](./configs/nerfdet_res50_2x_low_res.py)           | 53.0   | 26.8   | [log](<>) |
+| [NeRF-Det-R50\*](./configs/nerfdet_res50_2x_low_res_depth.py)   | 52.2   | 28.5   | [log](<>) |
+| [NeRF-Det-R101\*](./configs/nerfdet_res101_2x_low_res_depth.py) | 52.3   | 28.5   | [log](<>) |
+
+(Here NeRF-Det-R50\* means this model uses depth information during training.)
+
+### Notes
+
+- The values shown in the table are the best mAP obtained during training.
+
+- Since there is a lot of randomness in the behavior of the model, we conducted three experiments for each config and took the average. The mAP values shown in the table above are these averages.
+
+- We also conducted the same experiments with the original code; the results are shown below.
+
+  | Backbone        | mAP@25 | mAP@50 |
+  | :-------------: | :----: | :----: |
+  | NeRF-Det-R50    | 52.8   | 26.8   |
+  | NeRF-Det-R50\*  | 52.4   | 27.5   |
+  | NeRF-Det-R101\* | 52.8   | 28.6   |
+
+- Attention: Because of randomness in both the construction of the ScanNet dataset and the behavior of the model, the training results can fluctuate considerably. In our experience, results fluctuate by about plus or minus 1.5 points.
+
+## Evaluation using pretrained models
+
+1. Download the pretrained checkpoints through the links in the table above.
+
+2.
Testing + + To test, use: + + ```bash + python tools/test.py projects/NeRF-Det/configs/nerfdet_res50_2x_low_res.py ${CHECKPOINT_PATH} + ``` + +## Citation + + + +```latex +@inproceedings{ + xu2023nerfdet, + title={NeRF-Det: Learning Geometry-Aware Volumetric Representation for Multi-View 3D Object Detection}, + author={Xu, Chenfeng and Wu, Bichen and Hou, Ji and Tsai, Sam and Li, Ruilong and Wang, Jialiang and Zhan, Wei and He, Zijian and Vajda, Peter and Keutzer, Kurt and Tomizuka, Masayoshi}, + booktitle={ICCV}, + year={2023}, +} + +@inproceedings{ +park2023time, +title={Time Will Tell: New Outlooks and A Baseline for Temporal Multi-View 3D Object Detection}, +author={Jinhyung Park and Chenfeng Xu and Shijia Yang and Kurt Keutzer and Kris M. Kitani and Masayoshi Tomizuka and Wei Zhan}, +booktitle={The Eleventh International Conference on Learning Representations }, +year={2023}, +url={https://openreview.net/forum?id=H3HcEJA2Um} +} +``` diff --git a/projects/NeRF-Det/configs/nerfdet_res101_2x_low_res_depth.py b/projects/NeRF-Det/configs/nerfdet_res101_2x_low_res_depth.py new file mode 100644 index 0000000000..b3c639f19e --- /dev/null +++ b/projects/NeRF-Det/configs/nerfdet_res101_2x_low_res_depth.py @@ -0,0 +1,198 @@ +_base_ = ['../../../configs/_base_/default_runtime.py'] + +custom_imports = dict(imports=['projects.NeRF-Det.nerfdet']) +prior_generator = dict( + type='AlignedAnchor3DRangeGenerator', + ranges=[[-3.2, -3.2, -1.28, 3.2, 3.2, 1.28]], + rotations=[.0]) + +model = dict( + type='NerfDet', + data_preprocessor=dict( + type='NeRFDetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=10), + backbone=dict( + type='mmdet.ResNet', + depth=101, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'), + style='pytorch'), + neck=dict( + type='mmdet.FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=4), + neck_3d=dict( + type='IndoorImVoxelNeck', + in_channels=256, + out_channels=128, + n_blocks=[1, 1, 1]), + bbox_head=dict( + type='NerfDetHead', + bbox_loss=dict(type='AxisAlignedIoULoss', loss_weight=1.0), + n_classes=18, + n_levels=3, + n_channels=128, + n_reg_outs=6, + pts_assign_threshold=27, + pts_center_threshold=18, + prior_generator=prior_generator), + prior_generator=prior_generator, + voxel_size=[.16, .16, .2], + n_voxels=[40, 40, 16], + aabb=([-2.7, -2.7, -0.78], [3.7, 3.7, 1.78]), + near_far_range=[0.2, 8.0], + N_samples=64, + N_rand=2048, + nerf_mode='image', + depth_supervise=True, + use_nerf_mask=True, + nerf_sample_view=20, + squeeze_scale=4, + nerf_density=True, + train_cfg=dict(), + test_cfg=dict(nms_pre=1000, iou_thr=.25, score_thr=.01)) + +dataset_type = 'MultiViewScanNetDataset' +data_root = 'data/scannet/' +class_names = [ + 'cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window', 'bookshelf', + 'picture', 'counter', 'desk', 'curtain', 'refrigerator', 'showercurtrain', + 'toilet', 'sink', 'bathtub', 'garbagebin' +] +metainfo = dict(CLASSES=class_names) +file_client_args = dict(backend='disk') + +input_modality = dict( + use_camera=True, + use_depth=True, + use_lidar=False, + use_neuralrecon_depth=False, + use_ray=True) +backend_args = None + +train_collect_keys = [ + 'img', 'gt_bboxes_3d', 'gt_labels_3d', 'depth', 'lightpos', 'nerf_sizes', + 'raydirs', 'gt_images', 'gt_depths', 'denorm_images' +] + +test_collect_keys = [ + 
'img', + 'depth', + 'lightpos', + 'nerf_sizes', + 'raydirs', + 'gt_images', + 'gt_depths', + 'denorm_images', +] + +train_pipeline = [ + dict(type='LoadAnnotations3D'), + dict( + type='MultiViewPipeline', + n_images=48, + transforms=[ + dict(type='LoadImageFromFile', file_client_args=file_client_args), + dict(type='Resize', scale=(320, 240), keep_ratio=True), + ], + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + margin=10, + depth_range=[0.5, 5.5], + loading='random', + nerf_target_views=10), + dict(type='RandomShiftOrigin', std=(.7, .7, .0)), + dict(type='PackNeRFDetInputs', keys=train_collect_keys) +] + +test_pipeline = [ + dict(type='LoadAnnotations3D'), + dict( + type='MultiViewPipeline', + n_images=101, + transforms=[ + dict(type='LoadImageFromFile', file_client_args=file_client_args), + dict(type='Resize', scale=(320, 240), keep_ratio=True), + ], + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + margin=10, + depth_range=[0.5, 5.5], + loading='random', + nerf_target_views=1), + dict(type='PackNeRFDetInputs', keys=test_collect_keys) +] + +train_dataloader = dict( + batch_size=1, + num_workers=1, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=6, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='scannet_infos_train_new.pkl', + pipeline=train_pipeline, + modality=input_modality, + test_mode=False, + filter_empty_gt=True, + box_type_3d='Depth', + metainfo=metainfo))) +val_dataloader = dict( + batch_size=1, + num_workers=5, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='scannet_infos_val_new.pkl', + pipeline=test_pipeline, + modality=input_modality, + test_mode=True, + filter_empty_gt=True, + box_type_3d='Depth', + metainfo=metainfo)) +test_dataloader = val_dataloader + +val_evaluator = dict(type='IndoorMetric') +test_evaluator = val_evaluator + +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=1) +test_cfg = dict() +val_cfg = dict() + +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.0001), + paramwise_cfg=dict( + custom_keys={'backbone': dict(lr_mult=0.1, decay_mult=1.0)}), + clip_grad=dict(max_norm=35., norm_type=2)) +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=12, + by_epoch=True, + milestones=[8, 11], + gamma=0.1) +] + +# hooks +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=12)) + +# runtime +find_unused_parameters = True # only 1 of 4 FPN outputs is used diff --git a/projects/NeRF-Det/configs/nerfdet_res50_2x_low_res.py b/projects/NeRF-Det/configs/nerfdet_res50_2x_low_res.py new file mode 100644 index 0000000000..0321d54bba --- /dev/null +++ b/projects/NeRF-Det/configs/nerfdet_res50_2x_low_res.py @@ -0,0 +1,104 @@ +_base_ = ['./nerfdet_res50_2x_low_res_depth.py'] + +model = dict(depth_supervise=False) + +dataset_type = 'MultiViewScanNetDataset' +data_root = 'data/scannet/' +class_names = [ + 'cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window', 'bookshelf', + 'picture', 'counter', 'desk', 'curtain', 'refrigerator', 'showercurtrain', + 'toilet', 'sink', 'bathtub', 'garbagebin' +] +metainfo = dict(CLASSES=class_names) +file_client_args = dict(backend='disk') + +input_modality = dict(use_depth=False) +backend_args = None + +train_collect_keys = [ + 'img', 'gt_bboxes_3d', 
'gt_labels_3d', 'lightpos', 'nerf_sizes', 'raydirs', + 'gt_images', 'gt_depths', 'denorm_images' +] + +test_collect_keys = [ + 'img', + 'lightpos', + 'nerf_sizes', + 'raydirs', + 'gt_images', + 'gt_depths', + 'denorm_images', +] + +train_pipeline = [ + dict(type='LoadAnnotations3D'), + dict( + type='MultiViewPipeline', + n_images=50, + transforms=[ + dict(type='LoadImageFromFile', file_client_args=file_client_args), + dict(type='Resize', scale=(320, 240), keep_ratio=True), + ], + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + margin=10, + depth_range=[0.5, 5.5], + loading='random', + nerf_target_views=10), + dict(type='RandomShiftOrigin', std=(.7, .7, .0)), + dict(type='PackNeRFDetInputs', keys=train_collect_keys) +] + +test_pipeline = [ + dict(type='LoadAnnotations3D'), + dict( + type='MultiViewPipeline', + n_images=101, + transforms=[ + dict(type='LoadImageFromFile', file_client_args=file_client_args), + dict(type='Resize', scale=(320, 240), keep_ratio=True), + ], + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + margin=10, + depth_range=[0.5, 5.5], + loading='random', + nerf_target_views=1), + dict(type='PackNeRFDetInputs', keys=test_collect_keys) +] + +train_dataloader = dict( + batch_size=1, + num_workers=1, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=6, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='scannet_infos_train_new.pkl', + pipeline=train_pipeline, + modality=input_modality, + test_mode=False, + filter_empty_gt=True, + box_type_3d='Depth', + metainfo=metainfo))) +val_dataloader = dict( + batch_size=1, + num_workers=1, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='scannet_infos_val_new.pkl', + pipeline=test_pipeline, + modality=input_modality, + test_mode=True, + filter_empty_gt=True, + box_type_3d='Depth', + metainfo=metainfo)) +test_dataloader = val_dataloader diff --git a/projects/NeRF-Det/configs/nerfdet_res50_2x_low_res_depth.py b/projects/NeRF-Det/configs/nerfdet_res50_2x_low_res_depth.py new file mode 100644 index 0000000000..0143a8084a --- /dev/null +++ b/projects/NeRF-Det/configs/nerfdet_res50_2x_low_res_depth.py @@ -0,0 +1,198 @@ +_base_ = ['../../../configs/_base_/default_runtime.py'] + +custom_imports = dict(imports=['projects.NeRF-Det.nerfdet']) +prior_generator = dict( + type='AlignedAnchor3DRangeGenerator', + ranges=[[-3.2, -3.2, -1.28, 3.2, 3.2, 1.28]], + rotations=[.0]) + +model = dict( + type='NerfDet', + data_preprocessor=dict( + type='NeRFDetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=10), + backbone=dict( + type='mmdet.ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + style='pytorch'), + neck=dict( + type='mmdet.FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=4), + neck_3d=dict( + type='IndoorImVoxelNeck', + in_channels=256, + out_channels=128, + n_blocks=[1, 1, 1]), + bbox_head=dict( + type='NerfDetHead', + bbox_loss=dict(type='AxisAlignedIoULoss', loss_weight=1.0), + n_classes=18, + n_levels=3, + n_channels=128, + n_reg_outs=6, + pts_assign_threshold=27, + pts_center_threshold=18, + 
prior_generator=prior_generator), + prior_generator=prior_generator, + voxel_size=[.16, .16, .2], + n_voxels=[40, 40, 16], + aabb=([-2.7, -2.7, -0.78], [3.7, 3.7, 1.78]), + near_far_range=[0.2, 8.0], + N_samples=64, + N_rand=2048, + nerf_mode='image', + depth_supervise=True, + use_nerf_mask=True, + nerf_sample_view=20, + squeeze_scale=4, + nerf_density=True, + train_cfg=dict(), + test_cfg=dict(nms_pre=1000, iou_thr=.25, score_thr=.01)) + +dataset_type = 'MultiViewScanNetDataset' +data_root = 'data/scannet/' +class_names = [ + 'cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window', 'bookshelf', + 'picture', 'counter', 'desk', 'curtain', 'refrigerator', 'showercurtrain', + 'toilet', 'sink', 'bathtub', 'garbagebin' +] +metainfo = dict(CLASSES=class_names) +file_client_args = dict(backend='disk') + +input_modality = dict( + use_camera=True, + use_depth=True, + use_lidar=False, + use_neuralrecon_depth=False, + use_ray=True) +backend_args = None + +train_collect_keys = [ + 'img', 'gt_bboxes_3d', 'gt_labels_3d', 'depth', 'lightpos', 'nerf_sizes', + 'raydirs', 'gt_images', 'gt_depths', 'denorm_images' +] + +test_collect_keys = [ + 'img', + 'depth', + 'lightpos', + 'nerf_sizes', + 'raydirs', + 'gt_images', + 'gt_depths', + 'denorm_images', +] + +train_pipeline = [ + dict(type='LoadAnnotations3D'), + dict( + type='MultiViewPipeline', + n_images=50, + transforms=[ + dict(type='LoadImageFromFile', file_client_args=file_client_args), + dict(type='Resize', scale=(320, 240), keep_ratio=True), + ], + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + margin=10, + depth_range=[0.5, 5.5], + loading='random', + nerf_target_views=10), + dict(type='RandomShiftOrigin', std=(.7, .7, .0)), + dict(type='PackNeRFDetInputs', keys=train_collect_keys) +] + +test_pipeline = [ + dict(type='LoadAnnotations3D'), + dict( + type='MultiViewPipeline', + n_images=101, + transforms=[ + dict(type='LoadImageFromFile', file_client_args=file_client_args), + dict(type='Resize', scale=(320, 240), keep_ratio=True), + ], + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + margin=10, + depth_range=[0.5, 5.5], + loading='random', + nerf_target_views=1), + dict(type='PackNeRFDetInputs', keys=test_collect_keys) +] + +train_dataloader = dict( + batch_size=1, + num_workers=1, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=6, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='scannet_infos_train_new.pkl', + pipeline=train_pipeline, + modality=input_modality, + test_mode=False, + filter_empty_gt=True, + box_type_3d='Depth', + metainfo=metainfo))) +val_dataloader = dict( + batch_size=1, + num_workers=5, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='scannet_infos_val_new.pkl', + pipeline=test_pipeline, + modality=input_modality, + test_mode=True, + filter_empty_gt=True, + box_type_3d='Depth', + metainfo=metainfo)) +test_dataloader = val_dataloader + +val_evaluator = dict(type='IndoorMetric') +test_evaluator = val_evaluator + +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=1) +test_cfg = dict() +val_cfg = dict() + +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.0001), + paramwise_cfg=dict( + custom_keys={'backbone': dict(lr_mult=0.1, decay_mult=1.0)}), + clip_grad=dict(max_norm=35., norm_type=2)) 
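+# Note: with lr_mult=0.1 in paramwise_cfg, the pretrained ResNet-50 backbone
+# is effectively trained at lr=2e-5, while the newly added NeRF-Det modules
+# use the base AdamW lr of 2e-4.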
+param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=12, + by_epoch=True, + milestones=[8, 11], + gamma=0.1) +] + +# hooks +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=12)) + +# runtime +find_unused_parameters = True # only 1 of 4 FPN outputs is used diff --git a/projects/NeRF-Det/nerfdet/__init__.py b/projects/NeRF-Det/nerfdet/__init__.py new file mode 100644 index 0000000000..5ddef2f7be --- /dev/null +++ b/projects/NeRF-Det/nerfdet/__init__.py @@ -0,0 +1,11 @@ +from .data_preprocessor import NeRFDetDataPreprocessor +from .formating import PackNeRFDetInputs +from .multiview_pipeline import MultiViewPipeline, RandomShiftOrigin +from .nerfdet import NerfDet +from .nerfdet_head import NerfDetHead +from .scannet_multiview_dataset import MultiViewScanNetDataset + +__all__ = [ + 'MultiViewScanNetDataset', 'MultiViewPipeline', 'RandomShiftOrigin', + 'PackNeRFDetInputs', 'NeRFDetDataPreprocessor', 'NerfDetHead', 'NerfDet' +] diff --git a/projects/NeRF-Det/nerfdet/data_preprocessor.py b/projects/NeRF-Det/nerfdet/data_preprocessor.py new file mode 100644 index 0000000000..582a09f63c --- /dev/null +++ b/projects/NeRF-Det/nerfdet/data_preprocessor.py @@ -0,0 +1,583 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from numbers import Number +from typing import Dict, List, Optional, Sequence, Tuple, Union + +import numpy as np +import torch +from mmdet.models import DetDataPreprocessor +from mmdet.models.utils.misc import samplelist_boxtype2tensor +from mmengine.model import stack_batch +from mmengine.utils import is_seq_of +from torch import Tensor +from torch.nn import functional as F + +from mmdet3d.models.data_preprocessors.utils import multiview_img_stack_batch +from mmdet3d.models.data_preprocessors.voxelize import ( + VoxelizationByGridShape, dynamic_scatter_3d) +from mmdet3d.registry import MODELS +from mmdet3d.structures.det3d_data_sample import SampleList +from mmdet3d.utils import OptConfigType + + +@MODELS.register_module() +class NeRFDetDataPreprocessor(DetDataPreprocessor): + """In NeRF-Det, some extra information is needed in NeRF branch. We put the + datapreprocessor operations of these new information such as stack and pack + operations in this class. You can find the stack operations in subfuction + 'collate_data' and the pack operations in 'simple_process'. Other codes are + the same as the default class 'DetDataPreprocessor'. + + Points / Image pre-processor for point clouds / vision-only / multi- + modality 3D detection tasks. + + It provides the data pre-processing as follows + + - Collate and move image and point cloud data to the target device. + + - 1) For image data: + + - Pad images in inputs to the maximum size of current batch with defined + ``pad_value``. The padding size can be divisible by a defined + ``pad_size_divisor``. + - Stack images in inputs to batch_imgs. + - Convert images in inputs from bgr to rgb if the shape of input is + (3, H, W). + - Normalize images in inputs with defined std and mean. + - Do batch augmentations during training. + + - 2) For point cloud data: + + - If no voxelization, directly return list of point cloud data. + - If voxelization is applied, voxelize point cloud according to + ``voxel_type`` and obtain ``voxels``. + + Args: + voxel (bool): Whether to apply voxelization to point cloud. + Defaults to False. + voxel_type (str): Voxelization type. 
Two voxelization types are + provided: 'hard' and 'dynamic', respectively for hard voxelization + and dynamic voxelization. Defaults to 'hard'. + voxel_layer (dict or :obj:`ConfigDict`, optional): Voxelization layer + config. Defaults to None. + batch_first (bool): Whether to put the batch dimension to the first + dimension when getting voxel coordinates. Defaults to True. + max_voxels (int, optional): Maximum number of voxels in each voxel + grid. Defaults to None. + mean (Sequence[Number], optional): The pixel mean of R, G, B channels. + Defaults to None. + std (Sequence[Number], optional): The pixel standard deviation of + R, G, B channels. Defaults to None. + pad_size_divisor (int): The size of padded image should be divisible by + ``pad_size_divisor``. Defaults to 1. + pad_value (float or int): The padded pixel value. Defaults to 0. + pad_mask (bool): Whether to pad instance masks. Defaults to False. + mask_pad_value (int): The padded pixel value for instance masks. + Defaults to 0. + pad_seg (bool): Whether to pad semantic segmentation maps. + Defaults to False. + seg_pad_value (int): The padded pixel value for semantic segmentation + maps. Defaults to 255. + bgr_to_rgb (bool): Whether to convert image from BGR to RGB. + Defaults to False. + rgb_to_bgr (bool): Whether to convert image from RGB to BGR. + Defaults to False. + boxtype2tensor (bool): Whether to convert the ``BaseBoxes`` type of + bboxes data to ``Tensor`` type. Defaults to True. + non_blocking (bool): Whether to block current process when transferring + data to device. Defaults to False. + batch_augments (List[dict], optional): Batch-level augmentations. + Defaults to None. + """ + + def __init__(self, + voxel: bool = False, + voxel_type: str = 'hard', + voxel_layer: OptConfigType = None, + batch_first: bool = True, + max_voxels: Optional[int] = None, + mean: Sequence[Number] = None, + std: Sequence[Number] = None, + pad_size_divisor: int = 1, + pad_value: Union[float, int] = 0, + pad_mask: bool = False, + mask_pad_value: int = 0, + pad_seg: bool = False, + seg_pad_value: int = 255, + bgr_to_rgb: bool = False, + rgb_to_bgr: bool = False, + boxtype2tensor: bool = True, + non_blocking: bool = False, + batch_augments: Optional[List[dict]] = None) -> None: + super(NeRFDetDataPreprocessor, self).__init__( + mean=mean, + std=std, + pad_size_divisor=pad_size_divisor, + pad_value=pad_value, + pad_mask=pad_mask, + mask_pad_value=mask_pad_value, + pad_seg=pad_seg, + seg_pad_value=seg_pad_value, + bgr_to_rgb=bgr_to_rgb, + rgb_to_bgr=rgb_to_bgr, + boxtype2tensor=boxtype2tensor, + non_blocking=non_blocking, + batch_augments=batch_augments) + self.voxel = voxel + self.voxel_type = voxel_type + self.batch_first = batch_first + self.max_voxels = max_voxels + if voxel: + self.voxel_layer = VoxelizationByGridShape(**voxel_layer) + + def forward(self, + data: Union[dict, List[dict]], + training: bool = False) -> Union[dict, List[dict]]: + """Perform normalization, padding and bgr2rgb conversion based on + ``BaseDataPreprocessor``. + + Args: + data (dict or List[dict]): Data from dataloader. The dict contains + the whole batch data, when it is a list[dict], the list + indicates test time augmentation. + training (bool): Whether to enable training time augmentation. + Defaults to False. + + Returns: + dict or List[dict]: Data in the same format as the model input. 
+ """ + if isinstance(data, list): + num_augs = len(data) + aug_batch_data = [] + for aug_id in range(num_augs): + single_aug_batch_data = self.simple_process( + data[aug_id], training) + aug_batch_data.append(single_aug_batch_data) + return aug_batch_data + + else: + return self.simple_process(data, training) + + def simple_process(self, data: dict, training: bool = False) -> dict: + """Perform normalization, padding and bgr2rgb conversion for img data + based on ``BaseDataPreprocessor``, and voxelize point cloud if `voxel` + is set to be True. + + Args: + data (dict): Data sampled from dataloader. + training (bool): Whether to enable training time augmentation. + Defaults to False. + + Returns: + dict: Data in the same format as the model input. + """ + if 'img' in data['inputs']: + batch_pad_shape = self._get_pad_shape(data) + + data = self.collate_data(data) + inputs, data_samples = data['inputs'], data['data_samples'] + batch_inputs = dict() + + if 'points' in inputs: + batch_inputs['points'] = inputs['points'] + + if self.voxel: + voxel_dict = self.voxelize(inputs['points'], data_samples) + batch_inputs['voxels'] = voxel_dict + + if 'imgs' in inputs: + imgs = inputs['imgs'] + + if data_samples is not None: + # NOTE the batched image size information may be useful, e.g. + # in DETR, this is needed for the construction of masks, which + # is then used for the transformer_head. + batch_input_shape = tuple(imgs[0].size()[-2:]) + for data_sample, pad_shape in zip(data_samples, + batch_pad_shape): + data_sample.set_metainfo({ + 'batch_input_shape': batch_input_shape, + 'pad_shape': pad_shape + }) + + if self.boxtype2tensor: + samplelist_boxtype2tensor(data_samples) + if self.pad_mask: + self.pad_gt_masks(data_samples) + if self.pad_seg: + self.pad_gt_sem_seg(data_samples) + + if training and self.batch_augments is not None: + for batch_aug in self.batch_augments: + imgs, data_samples = batch_aug(imgs, data_samples) + batch_inputs['imgs'] = imgs + # Hard code here, will be changed later. + # if len(inputs['depth']) != 0: + if 'depth' in inputs.keys(): + batch_inputs['depth'] = inputs['depth'] + batch_inputs['lightpos'] = inputs['lightpos'] + batch_inputs['nerf_sizes'] = inputs['nerf_sizes'] + batch_inputs['denorm_images'] = inputs['denorm_images'] + batch_inputs['raydirs'] = inputs['raydirs'] + + return {'inputs': batch_inputs, 'data_samples': data_samples} + + def preprocess_img(self, _batch_img: Tensor) -> Tensor: + # channel transform + if self._channel_conversion: + _batch_img = _batch_img[[2, 1, 0], ...] + # Convert to float after channel conversion to ensure + # efficiency + _batch_img = _batch_img.float() + # Normalization. + if self._enable_normalize: + if self.mean.shape[0] == 3: + assert _batch_img.dim() == 3 and _batch_img.shape[0] == 3, ( + 'If the mean has 3 values, the input tensor ' + 'should in shape of (3, H, W), but got the ' + f'tensor with shape {_batch_img.shape}') + _batch_img = (_batch_img - self.mean) / self.std + return _batch_img + + def collate_data(self, data: dict) -> dict: + """Copy data to the target device and perform normalization, padding + and bgr2rgb conversion and stack based on ``BaseDataPreprocessor``. + + Collates the data sampled from dataloader into a list of dict and list + of labels, and then copies tensor to the target device. + + Args: + data (dict): Data sampled from dataloader. + + Returns: + dict: Data in the same format as the model input. 
+ """ + data = self.cast_data(data) # type: ignore + + if 'img' in data['inputs']: + _batch_imgs = data['inputs']['img'] + # Process data with `pseudo_collate`. + if is_seq_of(_batch_imgs, torch.Tensor): + batch_imgs = [] + img_dim = _batch_imgs[0].dim() + for _batch_img in _batch_imgs: + if img_dim == 3: # standard img + _batch_img = self.preprocess_img(_batch_img) + elif img_dim == 4: + _batch_img = [ + self.preprocess_img(_img) for _img in _batch_img + ] + + _batch_img = torch.stack(_batch_img, dim=0) + + batch_imgs.append(_batch_img) + + # Pad and stack Tensor. + if img_dim == 3: + batch_imgs = stack_batch(batch_imgs, self.pad_size_divisor, + self.pad_value) + elif img_dim == 4: + batch_imgs = multiview_img_stack_batch( + batch_imgs, self.pad_size_divisor, self.pad_value) + + # Process data with `default_collate`. + elif isinstance(_batch_imgs, torch.Tensor): + assert _batch_imgs.dim() == 4, ( + 'The input of `ImgDataPreprocessor` should be a NCHW ' + 'tensor or a list of tensor, but got a tensor with ' + f'shape: {_batch_imgs.shape}') + if self._channel_conversion: + _batch_imgs = _batch_imgs[:, [2, 1, 0], ...] + # Convert to float after channel conversion to ensure + # efficiency + _batch_imgs = _batch_imgs.float() + if self._enable_normalize: + _batch_imgs = (_batch_imgs - self.mean) / self.std + h, w = _batch_imgs.shape[2:] + target_h = math.ceil( + h / self.pad_size_divisor) * self.pad_size_divisor + target_w = math.ceil( + w / self.pad_size_divisor) * self.pad_size_divisor + pad_h = target_h - h + pad_w = target_w - w + batch_imgs = F.pad(_batch_imgs, (0, pad_w, 0, pad_h), + 'constant', self.pad_value) + else: + raise TypeError( + 'Output of `cast_data` should be a list of dict ' + 'or a tuple with inputs and data_samples, but got ' + f'{type(data)}: {data}') + + data['inputs']['imgs'] = batch_imgs + if 'raydirs' in data['inputs']: + _batch_dirs = data['inputs']['raydirs'] + batch_dirs = stack_batch(_batch_dirs) + data['inputs']['raydirs'] = batch_dirs + + if 'lightpos' in data['inputs']: + _batch_poses = data['inputs']['lightpos'] + batch_poses = stack_batch(_batch_poses) + data['inputs']['lightpos'] = batch_poses + + if 'denorm_images' in data['inputs']: + _batch_denorm_imgs = data['inputs']['denorm_images'] + # Process data with `pseudo_collate`. + if is_seq_of(_batch_denorm_imgs, torch.Tensor): + denorm_img_dim = _batch_denorm_imgs[0].dim() + # Pad and stack Tensor. + if denorm_img_dim == 3: + batch_denorm_imgs = stack_batch(_batch_denorm_imgs, + self.pad_size_divisor, + self.pad_value) + elif denorm_img_dim == 4: + batch_denorm_imgs = multiview_img_stack_batch( + _batch_denorm_imgs, self.pad_size_divisor, + self.pad_value) + data['inputs']['denorm_images'] = batch_denorm_imgs + + data.setdefault('data_samples', None) + + return data + + def _get_pad_shape(self, data: dict) -> List[Tuple[int, int]]: + """Get the pad_shape of each image based on data and + pad_size_divisor.""" + # rewrite `_get_pad_shape` for obtaining image inputs. + _batch_inputs = data['inputs']['img'] + # Process data with `pseudo_collate`. 
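+        # Each entry is either a single-view (C, H, W) tensor or a
+        # multi-view (N, C, H, W) tensor; in the multi-view case the pad
+        # shape is computed from the first view.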
+ if is_seq_of(_batch_inputs, torch.Tensor): + batch_pad_shape = [] + for ori_input in _batch_inputs: + if ori_input.dim() == 4: + # mean multiview input, select one of the + # image to calculate the pad shape + ori_input = ori_input[0] + pad_h = int( + np.ceil(ori_input.shape[1] / + self.pad_size_divisor)) * self.pad_size_divisor + pad_w = int( + np.ceil(ori_input.shape[2] / + self.pad_size_divisor)) * self.pad_size_divisor + batch_pad_shape.append((pad_h, pad_w)) + # Process data with `default_collate`. + elif isinstance(_batch_inputs, torch.Tensor): + assert _batch_inputs.dim() == 4, ( + 'The input of `ImgDataPreprocessor` should be a NCHW tensor ' + 'or a list of tensor, but got a tensor with shape: ' + f'{_batch_inputs.shape}') + pad_h = int( + np.ceil(_batch_inputs.shape[1] / + self.pad_size_divisor)) * self.pad_size_divisor + pad_w = int( + np.ceil(_batch_inputs.shape[2] / + self.pad_size_divisor)) * self.pad_size_divisor + batch_pad_shape = [(pad_h, pad_w)] * _batch_inputs.shape[0] + else: + raise TypeError('Output of `cast_data` should be a list of dict ' + 'or a tuple with inputs and data_samples, but got ' + f'{type(data)}: {data}') + return batch_pad_shape + + @torch.no_grad() + def voxelize(self, points: List[Tensor], + data_samples: SampleList) -> Dict[str, Tensor]: + """Apply voxelization to point cloud. + + Args: + points (List[Tensor]): Point cloud in one data batch. + data_samples: (list[:obj:`NeRFDet3DDataSample`]): The annotation + data of every samples. Add voxel-wise annotation for + segmentation. + + Returns: + Dict[str, Tensor]: Voxelization information. + + - voxels (Tensor): Features of voxels, shape is MxNxC for hard + voxelization, NxC for dynamic voxelization. + - coors (Tensor): Coordinates of voxels, shape is Nx(1+NDim), + where 1 represents the batch index. + - num_points (Tensor, optional): Number of points in each voxel. + - voxel_centers (Tensor, optional): Centers of voxels. 
+ """ + + voxel_dict = dict() + + if self.voxel_type == 'hard': + voxels, coors, num_points, voxel_centers = [], [], [], [] + for i, res in enumerate(points): + res_voxels, res_coors, res_num_points = self.voxel_layer(res) + res_voxel_centers = ( + res_coors[:, [2, 1, 0]] + 0.5) * res_voxels.new_tensor( + self.voxel_layer.voxel_size) + res_voxels.new_tensor( + self.voxel_layer.point_cloud_range[0:3]) + res_coors = F.pad(res_coors, (1, 0), mode='constant', value=i) + voxels.append(res_voxels) + coors.append(res_coors) + num_points.append(res_num_points) + voxel_centers.append(res_voxel_centers) + + voxels = torch.cat(voxels, dim=0) + coors = torch.cat(coors, dim=0) + num_points = torch.cat(num_points, dim=0) + voxel_centers = torch.cat(voxel_centers, dim=0) + + voxel_dict['num_points'] = num_points + voxel_dict['voxel_centers'] = voxel_centers + elif self.voxel_type == 'dynamic': + coors = [] + # dynamic voxelization only provide a coors mapping + for i, res in enumerate(points): + res_coors = self.voxel_layer(res) + res_coors = F.pad(res_coors, (1, 0), mode='constant', value=i) + coors.append(res_coors) + voxels = torch.cat(points, dim=0) + coors = torch.cat(coors, dim=0) + elif self.voxel_type == 'cylindrical': + voxels, coors = [], [] + for i, (res, data_sample) in enumerate(zip(points, data_samples)): + rho = torch.sqrt(res[:, 0]**2 + res[:, 1]**2) + phi = torch.atan2(res[:, 1], res[:, 0]) + polar_res = torch.stack((rho, phi, res[:, 2]), dim=-1) + min_bound = polar_res.new_tensor( + self.voxel_layer.point_cloud_range[:3]) + max_bound = polar_res.new_tensor( + self.voxel_layer.point_cloud_range[3:]) + try: # only support PyTorch >= 1.9.0 + polar_res_clamp = torch.clamp(polar_res, min_bound, + max_bound) + except TypeError: + polar_res_clamp = polar_res.clone() + for coor_idx in range(3): + polar_res_clamp[:, coor_idx][ + polar_res[:, coor_idx] > + max_bound[coor_idx]] = max_bound[coor_idx] + polar_res_clamp[:, coor_idx][ + polar_res[:, coor_idx] < + min_bound[coor_idx]] = min_bound[coor_idx] + res_coors = torch.floor( + (polar_res_clamp - min_bound) / polar_res_clamp.new_tensor( + self.voxel_layer.voxel_size)).int() + self.get_voxel_seg(res_coors, data_sample) + res_coors = F.pad(res_coors, (1, 0), mode='constant', value=i) + res_voxels = torch.cat((polar_res, res[:, :2], res[:, 3:]), + dim=-1) + voxels.append(res_voxels) + coors.append(res_coors) + voxels = torch.cat(voxels, dim=0) + coors = torch.cat(coors, dim=0) + elif self.voxel_type == 'minkunet': + voxels, coors = [], [] + voxel_size = points[0].new_tensor(self.voxel_layer.voxel_size) + for i, (res, data_sample) in enumerate(zip(points, data_samples)): + res_coors = torch.round(res[:, :3] / voxel_size).int() + res_coors -= res_coors.min(0)[0] + + res_coors_numpy = res_coors.cpu().numpy() + inds, point2voxel_map = self.sparse_quantize( + res_coors_numpy, return_index=True, return_inverse=True) + point2voxel_map = torch.from_numpy(point2voxel_map).cuda() + if self.training and self.max_voxels is not None: + if len(inds) > self.max_voxels: + inds = np.random.choice( + inds, self.max_voxels, replace=False) + inds = torch.from_numpy(inds).cuda() + if hasattr(data_sample.gt_pts_seg, 'pts_semantic_mask'): + data_sample.gt_pts_seg.voxel_semantic_mask \ + = data_sample.gt_pts_seg.pts_semantic_mask[inds] + res_voxel_coors = res_coors[inds] + res_voxels = res[inds] + if self.batch_first: + res_voxel_coors = F.pad( + res_voxel_coors, (1, 0), mode='constant', value=i) + data_sample.batch_idx = res_voxel_coors[:, 0] + else: + res_voxel_coors = 
F.pad( + res_voxel_coors, (0, 1), mode='constant', value=i) + data_sample.batch_idx = res_voxel_coors[:, -1] + data_sample.point2voxel_map = point2voxel_map.long() + voxels.append(res_voxels) + coors.append(res_voxel_coors) + voxels = torch.cat(voxels, dim=0) + coors = torch.cat(coors, dim=0) + + else: + raise ValueError(f'Invalid voxelization type {self.voxel_type}') + + voxel_dict['voxels'] = voxels + voxel_dict['coors'] = coors + + return voxel_dict + + def get_voxel_seg(self, res_coors: Tensor, + data_sample: SampleList) -> None: + """Get voxel-wise segmentation label and point2voxel map. + + Args: + res_coors (Tensor): The voxel coordinates of points, Nx3. + data_sample: (:obj:`NeRFDet3DDataSample`): The annotation data of + every samples. Add voxel-wise annotation forsegmentation. + """ + + if self.training: + pts_semantic_mask = data_sample.gt_pts_seg.pts_semantic_mask + voxel_semantic_mask, _, point2voxel_map = dynamic_scatter_3d( + F.one_hot(pts_semantic_mask.long()).float(), res_coors, 'mean', + True) + voxel_semantic_mask = torch.argmax(voxel_semantic_mask, dim=-1) + data_sample.gt_pts_seg.voxel_semantic_mask = voxel_semantic_mask + data_sample.point2voxel_map = point2voxel_map + else: + pseudo_tensor = res_coors.new_ones([res_coors.shape[0], 1]).float() + _, _, point2voxel_map = dynamic_scatter_3d(pseudo_tensor, + res_coors, 'mean', True) + data_sample.point2voxel_map = point2voxel_map + + def ravel_hash(self, x: np.ndarray) -> np.ndarray: + """Get voxel coordinates hash for np.unique. + + Args: + x (np.ndarray): The voxel coordinates of points, Nx3. + + Returns: + np.ndarray: Voxels coordinates hash. + """ + assert x.ndim == 2, x.shape + + x = x - np.min(x, axis=0) + x = x.astype(np.uint64, copy=False) + xmax = np.max(x, axis=0).astype(np.uint64) + 1 + + h = np.zeros(x.shape[0], dtype=np.uint64) + for k in range(x.shape[1] - 1): + h += x[:, k] + h *= xmax[k + 1] + h += x[:, -1] + return h + + def sparse_quantize(self, + coords: np.ndarray, + return_index: bool = False, + return_inverse: bool = False) -> List[np.ndarray]: + """Sparse Quantization for voxel coordinates used in Minkunet. + + Args: + coords (np.ndarray): The voxel coordinates of points, Nx3. + return_index (bool): Whether to return the indices of the unique + coords, shape (M,). + return_inverse (bool): Whether to return the indices of the + original coords, shape (N,). + + Returns: + List[np.ndarray]: Return index and inverse map if return_index and + return_inverse is True. + """ + _, indices, inverse_indices = np.unique( + self.ravel_hash(coords), return_index=True, return_inverse=True) + coords = coords[indices] + + outputs = [] + if return_index: + outputs += [indices] + if return_inverse: + outputs += [inverse_indices] + return outputs diff --git a/projects/NeRF-Det/nerfdet/formating.py b/projects/NeRF-Det/nerfdet/formating.py new file mode 100644 index 0000000000..6063d634cf --- /dev/null +++ b/projects/NeRF-Det/nerfdet/formating.py @@ -0,0 +1,350 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
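+# Packing transform for NeRF-Det: gathers multi-view images, depth maps,
+# ray directions and the NeRF ground-truth images/depths into the model
+# inputs and a `NeRFDet3DDataSample`.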
+from typing import List, Sequence, Union + +import mmengine +import numpy as np +import torch +from mmcv import BaseTransform +from mmengine.structures import InstanceData +from numpy import dtype + +from mmdet3d.registry import TRANSFORMS +from mmdet3d.structures import BaseInstance3DBoxes, PointData +from mmdet3d.structures.points import BasePoints +# from .det3d_data_sample import Det3DDataSample +from .nerf_det3d_data_sample import NeRFDet3DDataSample + + +def to_tensor( + data: Union[torch.Tensor, np.ndarray, Sequence, int, + float]) -> torch.Tensor: + """Convert objects of various python types to :obj:`torch.Tensor`. + + Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`, + :class:`Sequence`, :class:`int` and :class:`float`. + + Args: + data (torch.Tensor | numpy.ndarray | Sequence | int | float): Data to + be converted. + + Returns: + torch.Tensor: the converted data. + """ + + if isinstance(data, torch.Tensor): + return data + elif isinstance(data, np.ndarray): + if data.dtype is dtype('float64'): + data = data.astype(np.float32) + return torch.from_numpy(data) + elif isinstance(data, Sequence) and not mmengine.is_str(data): + return torch.tensor(data) + elif isinstance(data, int): + return torch.LongTensor([data]) + elif isinstance(data, float): + return torch.FloatTensor([data]) + else: + raise TypeError(f'type {type(data)} cannot be converted to tensor.') + + +@TRANSFORMS.register_module() +class PackNeRFDetInputs(BaseTransform): + INPUTS_KEYS = ['points', 'img'] + NERF_INPUT_KEYS = [ + 'img', 'denorm_images', 'depth', 'lightpos', 'nerf_sizes', 'raydirs' + ] + + INSTANCEDATA_3D_KEYS = [ + 'gt_bboxes_3d', 'gt_labels_3d', 'attr_labels', 'depths', 'centers_2d' + ] + INSTANCEDATA_2D_KEYS = [ + 'gt_bboxes', + 'gt_bboxes_labels', + ] + NERF_3D_KEYS = ['gt_images', 'gt_depths'] + + SEG_KEYS = [ + 'gt_seg_map', 'pts_instance_mask', 'pts_semantic_mask', + 'gt_semantic_seg' + ] + + def __init__( + self, + keys: tuple, + meta_keys: tuple = ('img_path', 'ori_shape', 'img_shape', 'lidar2img', + 'depth2img', 'cam2img', 'pad_shape', + 'scale_factor', 'flip', 'pcd_horizontal_flip', + 'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d', + 'img_norm_cfg', 'num_pts_feats', 'pcd_trans', + 'sample_idx', 'pcd_scale_factor', 'pcd_rotation', + 'pcd_rotation_angle', 'lidar_path', + 'transformation_3d_flow', 'trans_mat', + 'affine_aug', 'sweep_img_metas', 'ori_cam2img', + 'cam2global', 'crop_offset', 'img_crop_offset', + 'resize_img_shape', 'lidar2cam', 'ori_lidar2img', + 'num_ref_frames', 'num_views', 'ego2global', + 'axis_align_matrix') + ) -> None: + self.keys = keys + self.meta_keys = meta_keys + + def _remove_prefix(self, key: str) -> str: + if key.startswith('gt_'): + key = key[3:] + return key + + def transform(self, results: Union[dict, + List[dict]]) -> Union[dict, List[dict]]: + """Method to pack the input data. when the value in this dict is a + list, it usually is in Augmentations Testing. + + Args: + results (dict | list[dict]): Result dict from the data pipeline. + + Returns: + dict | List[dict]: + + - 'inputs' (dict): The forward data of models. It usually contains + following keys: + + - points + - img + + - 'data_samples' (:obj:`NeRFDet3DDataSample`): The annotation info + of the sample. 
+ """ + # augtest + if isinstance(results, list): + if len(results) == 1: + # simple test + return self.pack_single_results(results[0]) + pack_results = [] + for single_result in results: + pack_results.append(self.pack_single_results(single_result)) + return pack_results + # norm training and simple testing + elif isinstance(results, dict): + return self.pack_single_results(results) + else: + raise NotImplementedError + + def pack_single_results(self, results: dict) -> dict: + """Method to pack the single input data. when the value in this dict is + a list, it usually is in Augmentations Testing. + + Args: + results (dict): Result dict from the data pipeline. + + Returns: + dict: A dict contains + + - 'inputs' (dict): The forward data of models. It usually contains + following keys: + + - points + - img + + - 'data_samples' (:obj:`NeRFDet3DDataSample`): The annotation info + of the sample. + """ + # Format 3D data + if 'points' in results: + if isinstance(results['points'], BasePoints): + results['points'] = results['points'].tensor + + if 'img' in results: + if isinstance(results['img'], list): + # process multiple imgs in single frame + imgs = np.stack(results['img'], axis=0) + if imgs.flags.c_contiguous: + imgs = to_tensor(imgs).permute(0, 3, 1, 2).contiguous() + else: + imgs = to_tensor( + np.ascontiguousarray(imgs.transpose(0, 3, 1, 2))) + results['img'] = imgs + else: + img = results['img'] + if len(img.shape) < 3: + img = np.expand_dims(img, -1) + # To improve the computational speed by by 3-5 times, apply: + # `torch.permute()` rather than `np.transpose()`. + # Refer to https://github.com/open-mmlab/mmdetection/pull/9533 + # for more details + if img.flags.c_contiguous: + img = to_tensor(img).permute(2, 0, 1).contiguous() + else: + img = to_tensor( + np.ascontiguousarray(img.transpose(2, 0, 1))) + results['img'] = img + + if 'depth' in results: + if isinstance(results['depth'], list): + # process multiple depth imgs in single frame + depth_imgs = np.stack(results['depth'], axis=0) + if depth_imgs.flags.c_contiguous: + depth_imgs = to_tensor(depth_imgs).contiguous() + else: + depth_imgs = to_tensor(np.ascontiguousarray(depth_imgs)) + results['depth'] = depth_imgs + else: + depth_img = results['depth'] + if len(depth_img.shape) < 3: + depth_img = np.expand_dims(depth_img, -1) + if depth_img.flags.c_contiguous: + depth_img = to_tensor(depth_img).contiguous() + else: + depth_img = to_tensor(np.ascontiguousarray(depth_img)) + results['depth'] = depth_img + + if 'ray_info' in results: + if isinstance(results['raydirs'], list): + raydirs = np.stack(results['raydirs'], axis=0) + if raydirs.flags.c_contiguous: + raydirs = to_tensor(raydirs).contiguous() + else: + raydirs = to_tensor(np.ascontiguousarray(raydirs)) + results['raydirs'] = raydirs + + if isinstance(results['lightpos'], list): + lightposes = np.stack(results['lightpos'], axis=0) + if lightposes.flags.c_contiguous: + lightposes = to_tensor(lightposes).contiguous() + else: + lightposes = to_tensor(np.ascontiguousarray(lightposes)) + lightposes = lightposes.unsqueeze(1).repeat( + 1, raydirs.shape[1], 1) + results['lightpos'] = lightposes + + if isinstance(results['gt_images'], list): + gt_images = np.stack(results['gt_images'], axis=0) + if gt_images.flags.c_contiguous: + gt_images = to_tensor(gt_images).contiguous() + else: + gt_images = to_tensor(np.ascontiguousarray(gt_images)) + results['gt_images'] = gt_images + + if isinstance(results['gt_depths'], + list) and len(results['gt_depths']) != 0: + gt_depths = 
np.stack(results['gt_depths'], axis=0) + if gt_depths.flags.c_contiguous: + gt_depths = to_tensor(gt_depths).contiguous() + else: + gt_depths = to_tensor(np.ascontiguousarray(gt_depths)) + results['gt_depths'] = gt_depths + + if isinstance(results['denorm_images'], list): + denorm_imgs = np.stack(results['denorm_images'], axis=0) + if denorm_imgs.flags.c_contiguous: + denorm_imgs = to_tensor(denorm_imgs).permute( + 0, 3, 1, 2).contiguous() + else: + denorm_imgs = to_tensor( + np.ascontiguousarray( + denorm_imgs.transpose(0, 3, 1, 2))) + results['denorm_images'] = denorm_imgs + + for key in [ + 'proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_labels', + 'gt_bboxes_labels', 'attr_labels', 'pts_instance_mask', + 'pts_semantic_mask', 'centers_2d', 'depths', 'gt_labels_3d' + ]: + if key not in results: + continue + if isinstance(results[key], list): + results[key] = [to_tensor(res) for res in results[key]] + else: + results[key] = to_tensor(results[key]) + if 'gt_bboxes_3d' in results: + if not isinstance(results['gt_bboxes_3d'], BaseInstance3DBoxes): + results['gt_bboxes_3d'] = to_tensor(results['gt_bboxes_3d']) + + if 'gt_semantic_seg' in results: + results['gt_semantic_seg'] = to_tensor( + results['gt_semantic_seg'][None]) + if 'gt_seg_map' in results: + results['gt_seg_map'] = results['gt_seg_map'][None, ...] + + if 'gt_images' in results: + results['gt_images'] = to_tensor(results['gt_images']) + if 'gt_depths' in results: + results['gt_depths'] = to_tensor(results['gt_depths']) + + data_sample = NeRFDet3DDataSample() + gt_instances_3d = InstanceData() + gt_instances = InstanceData() + gt_pts_seg = PointData() + gt_nerf_images = InstanceData() + gt_nerf_depths = InstanceData() + + data_metas = {} + for key in self.meta_keys: + if key in results: + data_metas[key] = results[key] + elif 'images' in results: + if len(results['images'].keys()) == 1: + cam_type = list(results['images'].keys())[0] + # single-view image + if key in results['images'][cam_type]: + data_metas[key] = results['images'][cam_type][key] + else: + # multi-view image + img_metas = [] + cam_types = list(results['images'].keys()) + for cam_type in cam_types: + if key in results['images'][cam_type]: + img_metas.append(results['images'][cam_type][key]) + if len(img_metas) > 0: + data_metas[key] = img_metas + elif 'lidar_points' in results: + if key in results['lidar_points']: + data_metas[key] = results['lidar_points'][key] + data_sample.set_metainfo(data_metas) + + inputs = {} + for key in self.keys: + if key in results: + # if key in self.INPUTS_KEYS: + if key in self.NERF_INPUT_KEYS: + inputs[key] = results[key] + elif key in self.NERF_3D_KEYS: + if key == 'gt_images': + gt_nerf_images[self._remove_prefix(key)] = results[key] + else: + gt_nerf_depths[self._remove_prefix(key)] = results[key] + elif key in self.INSTANCEDATA_3D_KEYS: + gt_instances_3d[self._remove_prefix(key)] = results[key] + elif key in self.INSTANCEDATA_2D_KEYS: + if key == 'gt_bboxes_labels': + gt_instances['labels'] = results[key] + else: + gt_instances[self._remove_prefix(key)] = results[key] + elif key in self.SEG_KEYS: + gt_pts_seg[self._remove_prefix(key)] = results[key] + else: + raise NotImplementedError(f'Please modified ' + f'`Pack3DDetInputs` ' + f'to put {key} to ' + f'corresponding field') + + data_sample.gt_instances_3d = gt_instances_3d + data_sample.gt_instances = gt_instances + data_sample.gt_pts_seg = gt_pts_seg + data_sample.gt_nerf_images = gt_nerf_images + data_sample.gt_nerf_depths = gt_nerf_depths + if 'eval_ann_info' in results: + 
data_sample.eval_ann_info = results['eval_ann_info'] + else: + data_sample.eval_ann_info = None + + packed_results = dict() + packed_results['data_samples'] = data_sample + packed_results['inputs'] = inputs + + return packed_results + + def __repr__(self) -> str: + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(keys={self.keys})' + repr_str += f'(meta_keys={self.meta_keys})' + return repr_str diff --git a/projects/NeRF-Det/nerfdet/multiview_pipeline.py b/projects/NeRF-Det/nerfdet/multiview_pipeline.py new file mode 100644 index 0000000000..23e84ed71f --- /dev/null +++ b/projects/NeRF-Det/nerfdet/multiview_pipeline.py @@ -0,0 +1,297 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import mmcv +import numpy as np +from mmcv.transforms import BaseTransform, Compose +from PIL import Image + +from mmdet3d.registry import TRANSFORMS + + +def get_dtu_raydir(pixelcoords, intrinsic, rot, dir_norm=None): + # rot is c2w + # pixelcoords: H x W x 2 + x = (pixelcoords[..., 0] + 0.5 - intrinsic[0, 2]) / intrinsic[0, 0] + y = (pixelcoords[..., 1] + 0.5 - intrinsic[1, 2]) / intrinsic[1, 1] + z = np.ones_like(x) + dirs = np.stack([x, y, z], axis=-1) + # dirs = np.sum(dirs[...,None,:] * rot[:,:], axis=-1) # h*w*1*3 x 3*3 + dirs = dirs @ rot[:, :].T # + if dir_norm: + dirs = dirs / (np.linalg.norm(dirs, axis=-1, keepdims=True) + 1e-5) + + return dirs + + +@TRANSFORMS.register_module() +class MultiViewPipeline(BaseTransform): + """MultiViewPipeline used in nerfdet. + + Required Keys: + + - depth_info + - img_prefix + - img_info + - lidar2img + - c2w + - cammrotc2w + - lightpos + - ray_info + + Modified Keys: + + - lidar2img + + Added Keys: + + - img + - denorm_images + - depth + - c2w + - camrotc2w + - lightpos + - pixels + - raydirs + - gt_images + - gt_depths + - nerf_sizes + - depth_range + + Args: + transforms (list[dict]): The transform pipeline + used to process the imgs. + n_images (int): The number of sampled views. + mean (array): The mean values used in normalization. + std (array): The variance values used in normalization. + margin (int): The margin value. Defaults to 10. + depth_range (array): The range of the depth. + Defaults to [0.5, 5.5]. + loading (str): The mode of loading. Defaults to 'random'. + nerf_target_views (int): The number of novel views. + sample_freq (int): The frequency of sampling. + """ + + def __init__(self, + transforms: dict, + n_images: int, + mean: tuple = [123.675, 116.28, 103.53], + std: tuple = [58.395, 57.12, 57.375], + margin: int = 10, + depth_range: tuple = [0.5, 5.5], + loading: str = 'random', + nerf_target_views: int = 0, + sample_freq: int = 3): + self.transforms = Compose(transforms) + self.depth_transforms = Compose(transforms[1]) + self.n_images = n_images + self.mean = np.array(mean, dtype=np.float32) + self.std = np.array(std, dtype=np.float32) + self.margin = margin + self.depth_range = depth_range + self.loading = loading + self.sample_freq = sample_freq + self.nerf_target_views = nerf_target_views + + def transform(self, results: dict) -> dict: + """Nerfdet transform function. + + Args: + results (dict): Result dict from loading pipeline + + Returns: + dict: The result dict containing the processed results. + Updated key and value are described below. + + - img (list): The loaded origin image. + - denorm_images (list): The denormalized image. + - depth (list): The origin depth image. + - c2w (list): The c2w matrixes. + - camrotc2w (list): The rotation matrixes. 
+ - lightpos (list): The transform parameters of the camera. + - pixels (list): Some pixel information. + - raydirs (list): The ray-directions. + - gt_images (list): The groundtruth images. + - gt_depths (list): The groundtruth depth images. + - nerf_sizes (array): The size of the groundtruth images. + - depth_range (array): The range of the depth. + + Here we give a detailed explanation of some keys mentioned above. + Let P_c be the coordinate of camera, P_w be the coordinate of world. + There is such a conversion relationship: P_c = R @ P_w + T. + The 'camrotc2w' mentioned above corresponds to the R matrix here. + The 'lightpos' corresponds to the T matrix here. And if you put + R and T together, you can get the camera extrinsics matrix. It + corresponds to the 'c2w' mentioned above. + """ + imgs = [] + depths = [] + extrinsics = [] + c2ws = [] + camrotc2ws = [] + lightposes = [] + pixels = [] + raydirs = [] + gt_images = [] + gt_depths = [] + denorm_imgs_list = [] + nerf_sizes = [] + + if self.loading == 'random': + ids = np.arange(len(results['img_info'])) + replace = True if self.n_images > len(ids) else False + ids = np.random.choice(ids, self.n_images, replace=replace) + if self.nerf_target_views != 0: + target_id = np.random.choice( + ids, self.nerf_target_views, replace=False) + ids = np.setdiff1d(ids, target_id) + ids = ids.tolist() + target_id = target_id.tolist() + + else: + ids = np.arange(len(results['img_info'])) + begin_id = 0 + ids = np.arange(begin_id, + begin_id + self.n_images * self.sample_freq, + self.sample_freq) + if self.nerf_target_views != 0: + target_id = ids + + ratio = 0 + size = (240, 320) + for i in ids: + _results = dict() + _results['img_path'] = results['img_info'][i]['filename'] + _results = self.transforms(_results) + imgs.append(_results['img']) + # normalize + for key in _results.get('img_fields', ['img']): + _results[key] = mmcv.imnormalize(_results[key], self.mean, + self.std, True) + _results['img_norm_cfg'] = dict( + mean=self.mean, std=self.std, to_rgb=True) + # pad + for key in _results.get('img_fields', ['img']): + padded_img = mmcv.impad(_results[key], shape=size, pad_val=0) + _results[key] = padded_img + _results['pad_shape'] = padded_img.shape + _results['pad_fixed_size'] = size + ori_shape = _results['ori_shape'] + aft_shape = _results['img_shape'] + ratio = ori_shape[0] / aft_shape[0] + # prepare the depth information + if 'depth_info' in results.keys(): + if '.npy' in results['depth_info'][i]['filename']: + _results['depth'] = np.load( + results['depth_info'][i]['filename']) + else: + _results['depth'] = np.asarray((Image.open( + results['depth_info'][i]['filename']))) / 1000 + _results['depth'] = mmcv.imresize( + _results['depth'], (aft_shape[1], aft_shape[0])) + depths.append(_results['depth']) + + denorm_img = mmcv.imdenormalize( + _results['img'], self.mean, self.std, to_bgr=True).astype( + np.uint8) / 255.0 + denorm_imgs_list.append(denorm_img) + height, width = padded_img.shape[:2] + extrinsics.append(results['lidar2img']['extrinsic'][i]) + + # prepare the nerf information + if 'ray_info' in results.keys(): + intrinsics_nerf = results['lidar2img']['intrinsic'].copy() + intrinsics_nerf[:2] = intrinsics_nerf[:2] / ratio + assert self.nerf_target_views > 0 + for i in target_id: + c2ws.append(results['c2w'][i]) + camrotc2ws.append(results['camrotc2w'][i]) + lightposes.append(results['lightpos'][i]) + px, py = np.meshgrid( + np.arange(self.margin, + width - self.margin).astype(np.float32), + np.arange(self.margin, + height - 
self.margin).astype(np.float32)) + pixelcoords = np.stack((px, py), + axis=-1).astype(np.float32) # H x W x 2 + pixels.append(pixelcoords) + raydir = get_dtu_raydir(pixelcoords, intrinsics_nerf, + results['camrotc2w'][i]) + raydirs.append(np.reshape(raydir.astype(np.float32), (-1, 3))) + # read target images + temp_results = dict() + temp_results['img_path'] = results['img_info'][i]['filename'] + + temp_results_ = self.transforms(temp_results) + # normalize + for key in temp_results.get('img_fields', ['img']): + temp_results[key] = mmcv.imnormalize( + temp_results[key], self.mean, self.std, True) + temp_results['img_norm_cfg'] = dict( + mean=self.mean, std=self.std, to_rgb=True) + # pad + for key in temp_results.get('img_fields', ['img']): + padded_img = mmcv.impad( + temp_results[key], shape=size, pad_val=0) + temp_results[key] = padded_img + temp_results['pad_shape'] = padded_img.shape + temp_results['pad_fixed_size'] = size + # denormalize target_images. + denorm_imgs = mmcv.imdenormalize( + temp_results_['img'], self.mean, self.std, + to_bgr=True).astype(np.uint8) + gt_rgb_shape = denorm_imgs.shape + + gt_image = denorm_imgs[py.astype(np.int32), + px.astype(np.int32), :] + nerf_sizes.append(np.array(gt_image.shape)) + gt_image = np.reshape(gt_image, (-1, 3)) + gt_images.append(gt_image / 255.0) + if 'depth_info' in results.keys(): + if '.npy' in results['depth_info'][i]['filename']: + _results['depth'] = np.load( + results['depth_info'][i]['filename']) + else: + depth_image = Image.open( + results['depth_info'][i]['filename']) + _results['depth'] = np.asarray(depth_image) / 1000 + _results['depth'] = mmcv.imresize( + _results['depth'], + (gt_rgb_shape[1], gt_rgb_shape[0])) + + _results['depth'] = _results['depth'] + gt_depth = _results['depth'][py.astype(np.int32), + px.astype(np.int32)] + gt_depths.append(gt_depth) + + for key in _results.keys(): + if key not in ['img', 'img_info']: + results[key] = _results[key] + results['img'] = imgs + + if 'ray_info' in results.keys(): + results['c2w'] = c2ws + results['camrotc2w'] = camrotc2ws + results['lightpos'] = lightposes + results['pixels'] = pixels + results['raydirs'] = raydirs + results['gt_images'] = gt_images + results['gt_depths'] = gt_depths + results['nerf_sizes'] = nerf_sizes + results['denorm_images'] = denorm_imgs_list + results['depth_range'] = np.array([self.depth_range]) + + if len(depths) != 0: + results['depth'] = depths + results['lidar2img']['extrinsic'] = extrinsics + return results + + +@TRANSFORMS.register_module() +class RandomShiftOrigin(BaseTransform): + + def __init__(self, std): + self.std = std + + def transform(self, results): + shift = np.random.normal(.0, self.std, 3) + results['lidar2img']['origin'] += shift + return results diff --git a/projects/NeRF-Det/nerfdet/nerf_det3d_data_sample.py b/projects/NeRF-Det/nerfdet/nerf_det3d_data_sample.py new file mode 100644 index 0000000000..439e9a69ba --- /dev/null +++ b/projects/NeRF-Det/nerfdet/nerf_det3d_data_sample.py @@ -0,0 +1,52 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Optional, Tuple, Union + +import torch +from mmengine.structures import InstanceData + +from mmdet3d.structures import Det3DDataSample + + +class NeRFDet3DDataSample(Det3DDataSample): + """A data structure interface inheirted from Det3DDataSample. Some new + attributes are added to match the NeRF-Det project. 
+ + The attributes added in ``NeRFDet3DDataSample`` are divided into two parts: + + - ``gt_nerf_images`` (InstanceData): Ground truth of the images which + will be used in the NeRF branch. + - ``gt_nerf_depths`` (InstanceData): Ground truth of the depth images + which will be used in the NeRF branch if needed. + + For more details and examples, please refer to the 'Det3DDataSample' file. + """ + + @property + def gt_nerf_images(self) -> InstanceData: + return self._gt_nerf_images + + @gt_nerf_images.setter + def gt_nerf_images(self, value: InstanceData) -> None: + self.set_field(value, '_gt_nerf_images', dtype=InstanceData) + + @gt_nerf_images.deleter + def gt_nerf_images(self) -> None: + del self._gt_nerf_images + + @property + def gt_nerf_depths(self) -> InstanceData: + return self._gt_nerf_depths + + @gt_nerf_depths.setter + def gt_nerf_depths(self, value: InstanceData) -> None: + self.set_field(value, '_gt_nerf_depths', dtype=InstanceData) + + @gt_nerf_depths.deleter + def gt_nerf_depths(self) -> None: + del self._gt_nerf_depths + + +SampleList = List[NeRFDet3DDataSample] +OptSampleList = Optional[SampleList] +ForwardResults = Union[Dict[str, torch.Tensor], List[NeRFDet3DDataSample], + Tuple[torch.Tensor], torch.Tensor] diff --git a/projects/NeRF-Det/nerfdet/nerf_utils/nerf_mlp.py b/projects/NeRF-Det/nerfdet/nerf_utils/nerf_mlp.py new file mode 100644 index 0000000000..cc579ea23b --- /dev/null +++ b/projects/NeRF-Det/nerfdet/nerf_utils/nerf_mlp.py @@ -0,0 +1,277 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from typing import Callable, Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class MLP(nn.Module): + """The MLP module used in NerfDet. + + Args: + input_dim (int): The number of input tensor channels. + output_dim (int): The number of output tensor channels. + net_depth (int): The depth of the MLP. Defaults to 8. + net_width (int): The width of the MLP. Defaults to 256. + skip_layer (int): The layer to add skip layers to. Defaults to 4. + + hidden_init (Callable): The initialize method of the hidden layers. + hidden_activation (Callable): The activation function of hidden + layers, defaults to ReLU. + output_enabled (bool): If true, the output layers will be used. + Defaults to True. + output_init (Optional): The initialize method of the output layer. + output_activation(Optional): The activation function of output layers. + bias_enabled (Bool): If true, the bias will be used. + bias_init (Callable): The initialize method of the bias. + Defaults to True. 
+ """ + + def __init__( + self, + input_dim: int, + output_dim: int = None, + net_depth: int = 8, + net_width: int = 256, + skip_layer: int = 4, + hidden_init: Callable = nn.init.xavier_uniform_, + hidden_activation: Callable = nn.ReLU(), + output_enabled: bool = True, + output_init: Optional[Callable] = nn.init.xavier_uniform_, + output_activation: Optional[Callable] = nn.Identity(), + bias_enabled: bool = True, + bias_init: Callable = nn.init.zeros_, + ): + super().__init__() + self.input_dim = input_dim + self.output_dim = output_dim + self.net_depth = net_depth + self.net_width = net_width + self.skip_layer = skip_layer + self.hidden_init = hidden_init + self.hidden_activation = hidden_activation + self.output_enabled = output_enabled + self.output_init = output_init + self.output_activation = output_activation + self.bias_enabled = bias_enabled + self.bias_init = bias_init + + self.hidden_layers = nn.ModuleList() + in_features = self.input_dim + for i in range(self.net_depth): + self.hidden_layers.append( + nn.Linear(in_features, self.net_width, bias=bias_enabled)) + if (self.skip_layer is not None) and (i % self.skip_layer + == 0) and (i > 0): + in_features = self.net_width + self.input_dim + else: + in_features = self.net_width + if self.output_enabled: + self.output_layer = nn.Linear( + in_features, self.output_dim, bias=bias_enabled) + else: + self.output_dim = in_features + + self.initialize() + + def initialize(self): + + def init_func_hidden(m): + if isinstance(m, nn.Linear): + if self.hidden_init is not None: + self.hidden_init(m.weight) + if self.bias_enabled and self.bias_init is not None: + self.bias_init(m.bias) + + self.hidden_layers.apply(init_func_hidden) + if self.output_enabled: + + def init_func_output(m): + if isinstance(m, nn.Linear): + if self.output_init is not None: + self.output_init(m.weight) + if self.bias_enabled and self.bias_init is not None: + self.bias_init(m.bias) + + self.output_layer.apply(init_func_output) + + def forward(self, x): + inputs = x + for i in range(self.net_depth): + x = self.hidden_layers[i](x) + x = self.hidden_activation(x) + if (self.skip_layer is not None) and (i % self.skip_layer + == 0) and (i > 0): + x = torch.cat([x, inputs], dim=-1) + if self.output_enabled: + x = self.output_layer(x) + x = self.output_activation(x) + return x + + +class DenseLayer(MLP): + + def __init__(self, input_dim, output_dim, **kwargs): + super().__init__( + input_dim=input_dim, + output_dim=output_dim, + net_depth=0, # no hidden layers + **kwargs, + ) + + +class NerfMLP(nn.Module): + """The Nerf-MLP Module. + + Args: + input_dim (int): The number of input tensor channels. + condition_dim (int): The number of condition tensor channels. + feature_dim (int): The number of feature channels. Defaults to 0. + net_depth (int): The depth of the MLP. Defaults to 8. + net_width (int): The width of the MLP. Defaults to 256. + skip_layer (int): The layer to add skip layers to. Defaults to 4. + net_depth_condition (int): The depth of the second part of MLP. + Defaults to 1. + net_width_condition (int): The width of the second part of MLP. + Defaults to 128. 
+ """ + + def __init__( + self, + input_dim: int, + condition_dim: int, + feature_dim: int = 0, + net_depth: int = 8, + net_width: int = 256, + skip_layer: int = 4, + net_depth_condition: int = 1, + net_width_condition: int = 128, + ): + super().__init__() + self.base = MLP( + input_dim=input_dim + feature_dim, + net_depth=net_depth, + net_width=net_width, + skip_layer=skip_layer, + output_enabled=False, + ) + hidden_features = self.base.output_dim + self.sigma_layer = DenseLayer(hidden_features, 1) + + if condition_dim > 0: + self.bottleneck_layer = DenseLayer(hidden_features, net_width) + self.rgb_layer = MLP( + input_dim=net_width + condition_dim, + output_dim=3, + net_depth=net_depth_condition, + net_width=net_width_condition, + skip_layer=None, + ) + else: + self.rgb_layer = DenseLayer(hidden_features, 3) + + def query_density(self, x, features=None): + """Calculate the raw sigma.""" + if features is not None: + x = self.base(torch.cat([x, features], dim=-1)) + else: + x = self.base(x) + raw_sigma = self.sigma_layer(x) + return raw_sigma + + def forward(self, x, condition=None, features=None): + if features is not None: + x = self.base(torch.cat([x, features], dim=-1)) + else: + x = self.base(x) + raw_sigma = self.sigma_layer(x) + if condition is not None: + if condition.shape[:-1] != x.shape[:-1]: + num_rays, n_dim = condition.shape + condition = condition.view( + [num_rays] + [1] * (x.dim() - condition.dim()) + + [n_dim]).expand(list(x.shape[:-1]) + [n_dim]) + bottleneck = self.bottleneck_layer(x) + x = torch.cat([bottleneck, condition], dim=-1) + raw_rgb = self.rgb_layer(x) + return raw_rgb, raw_sigma + + +class SinusoidalEncoder(nn.Module): + """Sinusodial Positional Encoder used in NeRF.""" + + def __init__(self, x_dim, min_deg, max_deg, use_identity: bool = True): + super().__init__() + self.x_dim = x_dim + self.min_deg = min_deg + self.max_deg = max_deg + self.use_identity = use_identity + self.register_buffer( + 'scales', torch.tensor([2**i for i in range(min_deg, max_deg)])) + + @property + def latent_dim(self) -> int: + return (int(self.use_identity) + + (self.max_deg - self.min_deg) * 2) * self.x_dim + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if self.max_deg == self.min_deg: + return x + xb = torch.reshape( + (x[Ellipsis, None, :] * self.scales[:, None]), + list(x.shape[:-1]) + [(self.max_deg - self.min_deg) * self.x_dim], + ) + latent = torch.sin(torch.cat([xb, xb + 0.5 * math.pi], dim=-1)) + if self.use_identity: + latent = torch.cat([x] + [latent], dim=-1) + return latent + + +class VanillaNeRF(nn.Module): + """The Nerf-MLP with the positional encoder. + + Args: + net_depth (int): The depth of the MLP. Defaults to 8. + net_width (int): The width of the MLP. Defaults to 256. + skip_layer (int): The layer to add skip layers to. Defaults to 4. + feature_dim (int): The number of feature channels. Defaults to 0. + net_depth_condition (int): The depth of the second part of MLP. + Defaults to 1. + net_width_condition (int): The width of the second part of MLP. + Defaults to 128. 
+ """ + + def __init__(self, + net_depth: int = 8, + net_width: int = 256, + skip_layer: int = 4, + feature_dim: int = 0, + net_depth_condition: int = 1, + net_width_condition: int = 128): + super().__init__() + self.posi_encoder = SinusoidalEncoder(3, 0, 10, True) + self.view_encoder = SinusoidalEncoder(3, 0, 4, True) + self.mlp = NerfMLP( + input_dim=self.posi_encoder.latent_dim, + condition_dim=self.view_encoder.latent_dim, + feature_dim=feature_dim, + net_depth=net_depth, + net_width=net_width, + skip_layer=skip_layer, + net_depth_condition=net_depth_condition, + net_width_condition=net_width_condition, + ) + + def query_density(self, x, features=None): + x = self.posi_encoder(x) + sigma = self.mlp.query_density(x, features) + return F.relu(sigma) + + def forward(self, x, condition=None, features=None): + x = self.posi_encoder(x) + if condition is not None: + condition = self.view_encoder(condition) + rgb, sigma = self.mlp(x, condition=condition, features=features) + return torch.sigmoid(rgb), F.relu(sigma) diff --git a/projects/NeRF-Det/nerfdet/nerf_utils/projection.py b/projects/NeRF-Det/nerfdet/nerf_utils/projection.py new file mode 100644 index 0000000000..d88e281420 --- /dev/null +++ b/projects/NeRF-Det/nerfdet/nerf_utils/projection.py @@ -0,0 +1,140 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# Attention: This file is mainly modified based on the file with the same +# name in the original project. For more details, please refer to the +# origin project. +import torch +import torch.nn.functional as F + + +class Projector(): + + def __init__(self, device='cuda'): + self.device = device + + def inbound(self, pixel_locations, h, w): + """check if the pixel locations are in valid range.""" + return (pixel_locations[..., 0] <= w - 1.) & \ + (pixel_locations[..., 0] >= 0) & \ + (pixel_locations[..., 1] <= h - 1.) &\ + (pixel_locations[..., 1] >= 0) + + def normalize(self, pixel_locations, h, w): + resize_factor = torch.tensor([w - 1., h - 1. + ]).to(pixel_locations.device)[None, + None, :] + normalized_pixel_locations = 2 * pixel_locations / resize_factor - 1. + return normalized_pixel_locations + + def compute_projections(self, xyz, train_cameras): + """project 3D points into cameras.""" + + original_shape = xyz.shape[:2] + xyz = xyz.reshape(-1, 3) + num_views = len(train_cameras) + train_intrinsics = train_cameras[:, 2:18].reshape(-1, 4, 4) + train_poses = train_cameras[:, -16:].reshape(-1, 4, 4) + xyz_h = torch.cat([xyz, torch.ones_like(xyz[..., :1])], dim=-1) + # projections = train_intrinsics.bmm(torch.inverse(train_poses)) + # we have inverse the pose in dataloader so + # do not need to inverse here. 
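+        # xyz_h is (N, 4) in homogeneous coordinates; the chained bmm below
+        # gives (num_views, 4, N), permuted to (num_views, N, 4), and x, y
+        # are then divided by the z component to obtain pixel locations.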
+ projections = train_intrinsics.bmm(train_poses) \ + .bmm(xyz_h.t()[None, ...].repeat(num_views, 1, 1)) + projections = projections.permute(0, 2, 1) + pixel_locations = projections[..., :2] / torch.clamp( + projections[..., 2:3], min=1e-8) + pixel_locations = torch.clamp(pixel_locations, min=-1e6, max=1e6) + mask = projections[..., 2] > 0 + return pixel_locations.reshape((num_views, ) + original_shape + (2, )), \ + mask.reshape((num_views, ) + original_shape) # noqa + + def compute_angle(self, xyz, query_camera, train_cameras): + + original_shape = xyz.shape[:2] + xyz = xyz.reshape(-1, 3) + train_poses = train_cameras[:, -16:].reshape(-1, 4, 4) + num_views = len(train_poses) + query_pose = query_camera[-16:].reshape(-1, 4, + 4).repeat(num_views, 1, 1) + ray2tar_pose = (query_pose[:, :3, 3].unsqueeze(1) - xyz.unsqueeze(0)) + ray2tar_pose /= (torch.norm(ray2tar_pose, dim=-1, keepdim=True) + 1e-6) + ray2train_pose = ( + train_poses[:, :3, 3].unsqueeze(1) - xyz.unsqueeze(0)) + ray2train_pose /= ( + torch.norm(ray2train_pose, dim=-1, keepdim=True) + 1e-6) + ray_diff = ray2tar_pose - ray2train_pose + ray_diff_norm = torch.norm(ray_diff, dim=-1, keepdim=True) + ray_diff_dot = torch.sum( + ray2tar_pose * ray2train_pose, dim=-1, keepdim=True) + ray_diff_direction = ray_diff / torch.clamp(ray_diff_norm, min=1e-6) + ray_diff = torch.cat([ray_diff_direction, ray_diff_dot], dim=-1) + ray_diff = ray_diff.reshape((num_views, ) + original_shape + (4, )) + return ray_diff + + def compute(self, + xyz, + train_imgs, + train_cameras, + featmaps=None, + grid_sample=True): + + assert (train_imgs.shape[0] == 1) \ + and (train_cameras.shape[0] == 1) + # only support batch_size=1 for now + + train_imgs = train_imgs.squeeze(0) + train_cameras = train_cameras.squeeze(0) + + train_imgs = train_imgs.permute(0, 3, 1, 2) + h, w = train_cameras[0][:2] + + # compute the projection of the query points to each reference image + pixel_locations, mask_in_front = self.compute_projections( + xyz, train_cameras) + normalized_pixel_locations = self.normalize(pixel_locations, h, w) + # rgb sampling + rgbs_sampled = F.grid_sample( + train_imgs, normalized_pixel_locations, align_corners=True) + rgb_sampled = rgbs_sampled.permute(2, 3, 0, 1) + + # deep feature sampling + if featmaps is not None: + if grid_sample: + feat_sampled = F.grid_sample( + featmaps, normalized_pixel_locations, align_corners=True) + feat_sampled = feat_sampled.permute( + 2, 3, 0, 1) # [n_rays, n_samples, n_views, d] + rgb_feat_sampled = torch.cat( + [rgb_sampled, feat_sampled], + dim=-1) # [n_rays, n_samples, n_views, d+3] + # rgb_feat_sampled = feat_sampled + else: + n_images, n_channels, f_h, f_w = featmaps.shape + resize_factor = torch.tensor([f_w / w - 1., f_h / h - 1.]).to( + pixel_locations.device)[None, None, :] + sample_location = (pixel_locations * + resize_factor).round().long() + n_images, n_ray, n_sample, _ = sample_location.shape + sample_x = sample_location[..., 0].view(n_images, -1) + sample_y = sample_location[..., 1].view(n_images, -1) + valid = (sample_x >= 0) & (sample_y >= + 0) & (sample_x < f_w) & ( + sample_y < f_h) + valid = valid * mask_in_front.view(n_images, -1) + feat_sampled = torch.zeros( + (n_images, n_channels, sample_x.shape[-1]), + device=featmaps.device) + for i in range(n_images): + feat_sampled[i, :, + valid[i]] = featmaps[i, :, sample_y[i, + valid[i]], + sample_y[i, valid[i]]] + feat_sampled = feat_sampled.view(n_images, n_channels, n_ray, + n_sample) + rgb_feat_sampled = feat_sampled.permute(2, 3, 0, 1) + + else: + 
rgb_feat_sampled = None + inbound = self.inbound(pixel_locations, h, w) + mask = (inbound * mask_in_front).float().permute( + 1, 2, 0)[..., None] # [n_rays, n_samples, n_views, 1] + return rgb_feat_sampled, mask diff --git a/projects/NeRF-Det/nerfdet/nerf_utils/render_ray.py b/projects/NeRF-Det/nerfdet/nerf_utils/render_ray.py new file mode 100644 index 0000000000..76582c5773 --- /dev/null +++ b/projects/NeRF-Det/nerfdet/nerf_utils/render_ray.py @@ -0,0 +1,431 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# Attention: This file is mainly modified based on the file with the same +# name in the original project. For more details, please refer to the +# origin project. +from collections import OrderedDict + +import numpy as np +import torch +import torch.nn.functional as F + +rng = np.random.RandomState(234) + + +# helper functions for nerf ray rendering +def volume_sampling(sample_pts, features, aabb): + B, C, D, W, H = features.shape + assert B == 1 + aabb = torch.Tensor(aabb).to(sample_pts.device) + N_rays, N_samples, coords = sample_pts.shape + sample_pts = sample_pts.view(1, N_rays * N_samples, 1, 1, + 3).repeat(B, 1, 1, 1, 1) + aabbSize = aabb[1] - aabb[0] + invgridSize = 1.0 / aabbSize * 2 + norm_pts = (sample_pts - aabb[0]) * invgridSize - 1 + sample_features = F.grid_sample( + features, norm_pts, align_corners=True, padding_mode='border') + masks = ((norm_pts < 1) & (norm_pts > -1)).float().sum(dim=-1) + masks = (masks.view(N_rays, N_samples) == 3) + return sample_features.view(C, N_rays, + N_samples).permute(1, 2, 0).contiguous(), masks + + +def _compute_projection(img_meta): + views = len(img_meta['lidar2img']['extrinsic']) + intrinsic = torch.tensor(img_meta['lidar2img']['intrinsic'][:4, :4]) + ratio = img_meta['ori_shape'][0] / img_meta['img_shape'][0] + intrinsic[:2] /= ratio + intrinsic = intrinsic.unsqueeze(0).view(1, 16).repeat(views, 1) + img_size = torch.Tensor(img_meta['img_shape'][:2]).to(intrinsic.device) + img_size = img_size.unsqueeze(0).repeat(views, 1) + extrinsics = [] + for v in range(views): + extrinsics.append( + torch.Tensor(img_meta['lidar2img']['extrinsic'][v]).to( + intrinsic.device)) + extrinsic = torch.stack(extrinsics).view(views, 16) + train_cameras = torch.cat([img_size, intrinsic, extrinsic], dim=-1) + return train_cameras.unsqueeze(0) + + +def compute_mask_points(feature, mask): + weight = mask / (torch.sum(mask, dim=2, keepdim=True) + 1e-8) + mean = torch.sum(feature * weight, dim=2, keepdim=True) + var = torch.sum((feature - mean)**2, dim=2, keepdim=True) + var = var / (torch.sum(mask, dim=2, keepdim=True) + 1e-8) + var = torch.exp(-var) + return mean, var + + +def sample_pdf(bins, weights, N_samples, det=False): + """Helper function used for sampling. 
+ + Args: + bins (tensor):Tensor of shape [N_rays, M+1], M is the number of bins + weights (tensor):Tensor of shape [N_rays, M+1], M is the number of bins + N_samples (int):Number of samples along each ray + det (bool):If True, will perform deterministic sampling + + Returns: + samples (tuple): [N_rays, N_samples] + """ + + M = weights.shape[1] + weights += 1e-5 + # Get pdf + pdf = weights / torch.sum(weights, dim=-1, keepdim=True) + cdf = torch.cumsum(pdf, dim=-1) + cdf = torch.cat([torch.zeros_like(cdf[:, 0:1]), cdf], dim=-1) + + # Take uniform samples + if det: + u = torch.linspace(0., 1., N_samples, device=bins.device) + u = u.unsqueeze(0).repeat(bins.shape[0], 1) + else: + u = torch.rand(bins.shape[0], N_samples, device=bins.device) + + # Invert CDF + above_inds = torch.zeros_like(u, dtype=torch.long) + for i in range(M): + above_inds += (u >= cdf[:, i:i + 1]).long() + + # random sample inside each bin + below_inds = torch.clamp(above_inds - 1, min=0) + inds_g = torch.stack((below_inds, above_inds), dim=2) + + cdf = cdf.unsqueeze(1).repeat(1, N_samples, 1) + cdf_g = torch.gather(input=cdf, dim=-1, index=inds_g) + + bins = bins.unsqueeze(1).repeat(1, N_samples, 1) + bins_g = torch.gather(input=bins, dim=-1, index=inds_g) + + denom = cdf_g[:, :, 1] - cdf_g[:, :, 0] + denom = torch.where(denom < 1e-5, torch.ones_like(denom), denom) + t = (u - cdf_g[:, :, 0]) / denom + + samples = bins_g[:, :, 0] + t * (bins_g[:, :, 1] - bins_g[:, :, 0]) + + return samples + + +def sample_along_camera_ray(ray_o, + ray_d, + depth_range, + N_samples, + inv_uniform=False, + det=False): + """Sampling along the camera ray. + + Args: + ray_o (tensor): Origin of the ray in scene coordinate system; + tensor of shape [N_rays, 3] + ray_d (tensor): Homogeneous ray direction vectors in + scene coordinate system; tensor of shape [N_rays, 3] + depth_range (tuple): [near_depth, far_depth] + inv_uniform (bool): If True,uniformly sampling inverse depth. + det (bool): If True, will perform deterministic sampling. + Returns: + pts (tensor): Tensor of shape [N_rays, N_samples, 3] + z_vals (tensor): Tensor of shape [N_rays, N_samples] + """ + # will sample inside [near_depth, far_depth] + # assume the nearest possible depth is at least (min_ratio * depth) + near_depth_value = depth_range[0] + far_depth_value = depth_range[1] + assert near_depth_value > 0 and far_depth_value > 0 \ + and far_depth_value > near_depth_value + + near_depth = near_depth_value * torch.ones_like(ray_d[..., 0]) + + far_depth = far_depth_value * torch.ones_like(ray_d[..., 0]) + + if inv_uniform: + start = 1. / near_depth + step = (1. / far_depth - start) / (N_samples - 1) + inv_z_vals = torch.stack([start + i * step for i in range(N_samples)], + dim=1) + z_vals = 1. 
/ inv_z_vals + else: + start = near_depth + step = (far_depth - near_depth) / (N_samples - 1) + z_vals = torch.stack([start + i * step for i in range(N_samples)], + dim=1) + + if not det: + # get intervals between samples + mids = .5 * (z_vals[:, 1:] + z_vals[:, :-1]) + upper = torch.cat([mids, z_vals[:, -1:]], dim=-1) + lower = torch.cat([z_vals[:, 0:1], mids], dim=-1) + # uniform samples in those intervals + t_rand = torch.rand_like(z_vals) + z_vals = lower + (upper - lower) * t_rand + + ray_d = ray_d.unsqueeze(1).repeat(1, N_samples, 1) + ray_o = ray_o.unsqueeze(1).repeat(1, N_samples, 1) + pts = z_vals.unsqueeze(2) * ray_d + ray_o # [N_rays, N_samples, 3] + return pts, z_vals + + +# ray rendering of nerf +def raw2outputs(raw, z_vals, mask, white_bkgd=False): + """Transform raw data to outputs: + + Args: + raw(tensor):Raw network output.Tensor of shape [N_rays, N_samples, 4] + z_vals(tensor):Depth of point samples along rays. + Tensor of shape [N_rays, N_samples] + ray_d(tensor):[N_rays, 3] + + Returns: + ret(dict): + -rgb(tensor):[N_rays, 3] + -depth(tensor):[N_rays,] + -weights(tensor):[N_rays,] + -depth_std(tensor):[N_rays,] + """ + rgb = raw[:, :, :3] # [N_rays, N_samples, 3] + sigma = raw[:, :, 3] # [N_rays, N_samples] + + # note: we did not use the intervals here, + # because in practice different scenes from COLMAP can have + # very different scales, and using interval can affect + # the model's generalization ability. + # Therefore we don't use the intervals for both training and evaluation. + sigma2alpha = lambda sigma, dists: 1. - torch.exp(-sigma) # noqa + + # point samples are ordered with increasing depth + # interval between samples + dists = z_vals[:, 1:] - z_vals[:, :-1] + dists = torch.cat((dists, dists[:, -1:]), dim=-1) + + alpha = sigma2alpha(sigma, dists) + + T = torch.cumprod(1. - alpha + 1e-10, dim=-1)[:, :-1] + T = torch.cat((torch.ones_like(T[:, 0:1]), T), dim=-1) + + # maths show weights, and summation of weights along a ray, + # are always inside [0, 1] + weights = alpha * T + rgb_map = torch.sum(weights.unsqueeze(2) * rgb, dim=1) + + if white_bkgd: + rgb_map = rgb_map + (1. 
- torch.sum(weights, dim=-1, keepdim=True)) + + if mask is not None: + mask = mask.float().sum(dim=1) > 8 + + depth_map = torch.sum( + weights * z_vals, dim=-1) / ( + torch.sum(weights, dim=-1) + 1e-8) + depth_map = torch.clamp(depth_map, z_vals.min(), z_vals.max()) + + ret = OrderedDict([('rgb', rgb_map), ('depth', depth_map), + ('weights', weights), ('mask', mask), ('alpha', alpha), + ('z_vals', z_vals), ('transparency', T)]) + + return ret + + +def render_rays_func( + ray_o, + ray_d, + mean_volume, + cov_volume, + features_2D, + img, + aabb, + near_far_range, + N_samples, + N_rand=4096, + nerf_mlp=None, + img_meta=None, + projector=None, + mode='volume', # volume and image + nerf_sample_view=3, + inv_uniform=False, + N_importance=0, + det=False, + is_train=True, + white_bkgd=False, + gt_rgb=None, + gt_depth=None): + + ret = { + 'outputs_coarse': None, + 'outputs_fine': None, + 'gt_rgb': gt_rgb, + 'gt_depth': gt_depth + } + + # pts: [N_rays, N_samples, 3] + # z_vals: [N_rays, N_samples] + pts, z_vals = sample_along_camera_ray( + ray_o=ray_o, + ray_d=ray_d, + depth_range=near_far_range, + N_samples=N_samples, + inv_uniform=inv_uniform, + det=det) + N_rays, N_samples = pts.shape[:2] + + if mode == 'image': + img = img.permute(0, 2, 3, 1).unsqueeze(0) + train_camera = _compute_projection(img_meta).to(img.device) + rgb_feat, mask = projector.compute( + pts, img, train_camera, features_2D, grid_sample=True) + pixel_mask = mask[..., 0].sum(dim=2) > 1 + mean, var = compute_mask_points(rgb_feat, mask) + globalfeat = torch.cat([mean, var], dim=-1).squeeze(2) + rgb_pts, density_pts = nerf_mlp(pts, ray_d, globalfeat) + raw_coarse = torch.cat([rgb_pts, density_pts], dim=-1) + ret['sigma'] = density_pts + + elif mode == 'volume': + mean_pts, inbound_masks = volume_sampling(pts, mean_volume, aabb) + cov_pts, inbound_masks = volume_sampling(pts, cov_volume, aabb) + # This masks is for indicating which points outside of aabb + img = img.permute(0, 2, 3, 1).unsqueeze(0) + train_camera = _compute_projection(img_meta).to(img.device) + _, view_mask = projector.compute(pts, img, train_camera, None) + pixel_mask = view_mask[..., 0].sum(dim=2) > 1 + # plot_3D_vis(pts, aabb, img, train_camera) + # [N_rays, N_samples], should at least have 2 observations + # This mask is for indicating which points do not have projected point + globalpts = torch.cat([mean_pts, cov_pts], dim=-1) + rgb_pts, density_pts = nerf_mlp(pts, ray_d, globalpts) + density_pts = density_pts * inbound_masks.unsqueeze(dim=-1) + + raw_coarse = torch.cat([rgb_pts, density_pts], dim=-1) + + outputs_coarse = raw2outputs( + raw_coarse, z_vals, pixel_mask, white_bkgd=white_bkgd) + ret['outputs_coarse'] = outputs_coarse + + return ret + + +def render_rays( + ray_batch, + mean_volume, + cov_volume, + features_2D, + img, + aabb, + near_far_range, + N_samples, + N_rand=4096, + nerf_mlp=None, + img_meta=None, + projector=None, + mode='volume', # volume and image + nerf_sample_view=3, + inv_uniform=False, + N_importance=0, + det=False, + is_train=True, + white_bkgd=False, + render_testing=False): + """The function of the nerf rendering.""" + + ray_o = ray_batch['ray_o'] + ray_d = ray_batch['ray_d'] + gt_rgb = ray_batch['gt_rgb'] + gt_depth = ray_batch['gt_depth'] + nerf_sizes = ray_batch['nerf_sizes'] + if is_train: + ray_o = ray_o.view(-1, 3) + ray_d = ray_d.view(-1, 3) + gt_rgb = gt_rgb.view(-1, 3) + if gt_depth.shape[1] != 0: + gt_depth = gt_depth.view(-1, 1) + non_zero_depth = (gt_depth > 0).squeeze(-1) + ray_o = ray_o[non_zero_depth] + ray_d = 
ray_d[non_zero_depth] + gt_rgb = gt_rgb[non_zero_depth] + gt_depth = gt_depth[non_zero_depth] + else: + gt_depth = None + total_rays = ray_d.shape[0] + select_inds = rng.choice(total_rays, size=(N_rand, ), replace=False) + ray_o = ray_o[select_inds] + ray_d = ray_d[select_inds] + gt_rgb = gt_rgb[select_inds] + if gt_depth is not None: + gt_depth = gt_depth[select_inds] + + rets = render_rays_func( + ray_o, + ray_d, + mean_volume, + cov_volume, + features_2D, + img, + aabb, + near_far_range, + N_samples, + N_rand, + nerf_mlp, + img_meta, + projector, + mode, # volume and image + nerf_sample_view, + inv_uniform, + N_importance, + det, + is_train, + white_bkgd, + gt_rgb, + gt_depth) + + elif render_testing: + nerf_size = nerf_sizes[0] + view_num = ray_o.shape[1] + H = nerf_size[0][0] + W = nerf_size[0][1] + ray_o = ray_o.view(-1, 3) + ray_d = ray_d.view(-1, 3) + gt_rgb = gt_rgb.view(-1, 3) + print(gt_rgb.shape) + if len(gt_depth) != 0: + gt_depth = gt_depth.view(-1, 1) + else: + gt_depth = None + assert view_num * H * W == ray_o.shape[0] + num_rays = ray_o.shape[0] + results = [] + rgbs = [] + for i in range(0, num_rays, N_rand): + ray_o_chunck = ray_o[i:i + N_rand, :] + ray_d_chunck = ray_d[i:i + N_rand, :] + + ret = render_rays_func(ray_o_chunck, ray_d_chunck, mean_volume, + cov_volume, features_2D, img, aabb, + near_far_range, N_samples, N_rand, nerf_mlp, + img_meta, projector, mode, nerf_sample_view, + inv_uniform, N_importance, True, is_train, + white_bkgd, gt_rgb, gt_depth) + results.append(ret) + + rgbs = [] + depths = [] + + if results[0]['outputs_coarse'] is not None: + for i in range(len(results)): + rgb = results[i]['outputs_coarse']['rgb'] + rgbs.append(rgb) + depth = results[i]['outputs_coarse']['depth'] + depths.append(depth) + + rets = { + 'outputs_coarse': { + 'rgb': torch.cat(rgbs, dim=0).view(view_num, H, W, 3), + 'depth': torch.cat(depths, dim=0).view(view_num, H, W, 1), + }, + 'gt_rgb': + gt_rgb.view(view_num, H, W, 3), + 'gt_depth': + gt_depth.view(view_num, H, W, 1) if gt_depth is not None else None, + } + else: + rets = None + return rets diff --git a/projects/NeRF-Det/nerfdet/nerf_utils/save_rendered_img.py b/projects/NeRF-Det/nerfdet/nerf_utils/save_rendered_img.py new file mode 100644 index 0000000000..f9de3e3107 --- /dev/null +++ b/projects/NeRF-Det/nerfdet/nerf_utils/save_rendered_img.py @@ -0,0 +1,79 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
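For reference, the volume-rendering recipe implemented in `render_ray.py` above (stratified depth sampling in `sample_along_camera_ray`, then `alpha = 1 - exp(-sigma)` compositing in `raw2outputs`) can be exercised in isolation. The snippet below is a minimal, self-contained sketch with toy shapes and random stand-in MLP outputs (values and sizes are chosen arbitrarily for illustration):

```python
import torch

# Toy setup: 2 rays, 5 depth samples per ray in [0.2, 6.0] (deterministic branch).
n_rays, n_samples, near, far = 2, 5, 0.2, 6.0
ray_o = torch.zeros(n_rays, 3)
ray_d = torch.tensor([[0., 0., 1.], [0., 1., 0.]])
z_vals = torch.linspace(near, far, n_samples).unsqueeze(0).expand(n_rays, -1)
pts = z_vals.unsqueeze(2) * ray_d.unsqueeze(1) + ray_o.unsqueeze(1)  # [2, 5, 3]

# Compositing as in raw2outputs: alpha = 1 - exp(-sigma), weights = alpha * T.
rgb = torch.rand(n_rays, n_samples, 3)   # stand-in for the NeRF MLP colour output
sigma = torch.rand(n_rays, n_samples)    # stand-in for the NeRF MLP density output
alpha = 1. - torch.exp(-sigma)
T = torch.cumprod(1. - alpha + 1e-10, dim=-1)[:, :-1]
T = torch.cat((torch.ones_like(T[:, :1]), T), dim=-1)
weights = alpha * T
rgb_map = torch.sum(weights.unsqueeze(2) * rgb, dim=1)               # [2, 3]
depth_map = (weights * z_vals).sum(-1) / (weights.sum(-1) + 1e-8)    # [2]
print(rgb_map.shape, depth_map.shape)
```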
+import os + +import cv2 +import numpy as np +import torch +from skimage.metrics import structural_similarity + + +def compute_psnr_from_mse(mse): + return -10.0 * torch.log(mse) / np.log(10.0) + + +def compute_psnr(pred, target, mask=None): + """Compute psnr value (we assume the maximum pixel value is 1).""" + if mask is not None: + pred, target = pred[mask], target[mask] + mse = ((pred - target)**2).mean() + return compute_psnr_from_mse(mse).cpu().numpy() + + +def compute_ssim(pred, target, mask=None): + """Computes Masked SSIM following the neuralbody paper.""" + assert pred.shape == target.shape and pred.shape[-1] == 3 + if mask is not None: + x, y, w, h = cv2.boundingRect(mask.cpu().numpy().astype(np.uint8)) + pred = pred[y:y + h, x:x + w] + target = target[y:y + h, x:x + w] + try: + ssim = structural_similarity( + pred.cpu().numpy(), target.cpu().numpy(), channel_axis=-1) + except ValueError: + ssim = structural_similarity( + pred.cpu().numpy(), target.cpu().numpy(), multichannel=True) + return ssim + + +def save_rendered_img(img_meta, rendered_results): + filename = img_meta[0]['filename'] + scenes = filename.split('/')[-2] + + for ret in rendered_results: + depth = ret['outputs_coarse']['depth'] + rgb = ret['outputs_coarse']['rgb'] + gt = ret['gt_rgb'] + gt_depth = ret['gt_depth'] + + # save images + psnr_total = 0 + ssim_total = 0 + rsme = 0 + for v in range(gt.shape[0]): + rsme += ((depth[v] - gt_depth[v])**2).cpu().numpy() + depth_ = ((depth[v] - depth[v].min()) / + (depth[v].max() - depth[v].min() + 1e-8)).repeat(1, 1, 3) + img_to_save = torch.cat([rgb[v], gt[v], depth_], dim=1) + image_path = os.path.join('nerf_vs_rebuttal', scenes) + if not os.path.exists(image_path): + os.makedirs(image_path) + save_dir = os.path.join(image_path, 'view_' + str(v) + '.png') + + font = cv2.FONT_HERSHEY_SIMPLEX + org = (50, 50) + fontScale = 1 + color = (255, 0, 0) + thickness = 2 + image = np.uint8(img_to_save.cpu().numpy() * 255.0) + psnr = compute_psnr(rgb[v], gt[v], mask=None) + psnr_total += psnr + ssim = compute_ssim(rgb[v], gt[v], mask=None) + ssim_total += ssim + image = cv2.putText( + image, 'PSNR: ' + '%.2f' % compute_psnr(rgb[v], gt[v], mask=None), + org, font, fontScale, color, thickness, cv2.LINE_AA) + + cv2.imwrite(save_dir, image) + + return psnr_total / gt.shape[0], ssim_total / gt.shape[0], rsme / gt.shape[ + 0] diff --git a/projects/NeRF-Det/nerfdet/nerfdet.py b/projects/NeRF-Det/nerfdet/nerfdet.py new file mode 100644 index 0000000000..ee66387cb5 --- /dev/null +++ b/projects/NeRF-Det/nerfdet/nerfdet.py @@ -0,0 +1,632 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from mmdet3d.models.detectors import Base3DDetector +from mmdet3d.registry import MODELS, TASK_UTILS +from mmdet3d.structures.det3d_data_sample import SampleList +from mmdet3d.utils import ConfigType, OptConfigType +from .nerf_utils.nerf_mlp import VanillaNeRF +from .nerf_utils.projection import Projector +from .nerf_utils.render_ray import render_rays + +# from ..utils.nerf_utils.save_rendered_img import save_rendered_img + + +@MODELS.register_module() +class NerfDet(Base3DDetector): + r"""`ImVoxelNet `_. + + Args: + backbone (:obj:`ConfigDict` or dict): The backbone config. + neck (:obj:`ConfigDict` or dict): The neck config. + neck_3d(:obj:`ConfigDict` or dict): The 3D neck config. + bbox_head(:obj:`ConfigDict` or dict): The bbox head config. 
+ prior_generator (:obj:`ConfigDict` or dict): The prior generator + config. + n_voxels (list): Number of voxels along x, y, z axis. + voxel_size (list): The size of voxels.Each voxel represents + a cube of `voxel_size[0]` meters, `voxel_size[1]` meters, + `` + train_cfg (:obj:`ConfigDict` or dict, optional): Config dict of + training hyper-parameters. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): Config dict of test + hyper-parameters. Defaults to None. + init_cfg (:obj:`ConfigDict` or dict, optional): The initialization + config. Defaults to None. + render_testing (bool): If you want to render novel view, please set + "render_testing = True" in config + The other args are the parameters of NeRF, you can just use the + default values. + """ + + def __init__( + self, + backbone: ConfigType, + neck: ConfigType, + neck_3d: ConfigType, + bbox_head: ConfigType, + prior_generator: ConfigType, + n_voxels: List, + voxel_size: List, + head_2d: ConfigType = None, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptConfigType = None, + # pretrained, + aabb: Tuple = None, + near_far_range: List = None, + N_samples: int = 64, + N_rand: int = 2048, + depth_supervise: bool = False, + use_nerf_mask: bool = True, + nerf_sample_view: int = 3, + nerf_mode: str = 'volume', + squeeze_scale: int = 4, + rgb_supervision: bool = True, + nerf_density: bool = False, + render_testing: bool = False): + super().__init__( + data_preprocessor=data_preprocessor, init_cfg=init_cfg) + self.backbone = MODELS.build(backbone) + self.neck = MODELS.build(neck) + self.neck_3d = MODELS.build(neck_3d) + bbox_head.update(train_cfg=train_cfg) + bbox_head.update(test_cfg=test_cfg) + self.bbox_head = MODELS.build(bbox_head) + self.head_2d = MODELS.build(head_2d) if head_2d is not None else None + self.n_voxels = n_voxels + self.prior_generator = TASK_UTILS.build(prior_generator) + self.voxel_size = voxel_size + self.train_cfg = train_cfg + self.test_cfg = test_cfg + self.aabb = aabb + self.near_far_range = near_far_range + self.N_samples = N_samples + self.N_rand = N_rand + self.depth_supervise = depth_supervise + self.projector = Projector() + self.squeeze_scale = squeeze_scale + self.use_nerf_mask = use_nerf_mask + self.rgb_supervision = rgb_supervision + nerf_feature_dim = neck['out_channels'] // squeeze_scale + self.nerf_mlp = VanillaNeRF( + net_depth=4, # The depth of the MLP + net_width=256, # The width of the MLP + skip_layer=3, # The layer to add skip layers to. + feature_dim=nerf_feature_dim + 6, # + RGB original imgs + net_depth_condition=1, # The depth of the second part of MLP + net_width_condition=128) + self.nerf_mode = nerf_mode + self.nerf_density = nerf_density + self.nerf_sample_view = nerf_sample_view + self.render_testing = render_testing + + # hard code here, will deal with batch issue later. 
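+        # The heads below prepare features for the NeRF branch: `cov` predicts a
+        # single-channel 3D volume, while mean_mapping / cov_mapping and
+        # mapping / mapping_2d squeeze the aggregated volume statistics and
+        # per-view neck features to nerf_feature_dim // 2 channels each.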
+ self.cov = nn.Sequential( + nn.Conv3d( + neck['out_channels'], + neck['out_channels'], + kernel_size=3, + padding=1), nn.ReLU(inplace=True), + nn.Conv3d( + neck['out_channels'], + neck['out_channels'], + kernel_size=3, + padding=1), nn.ReLU(inplace=True), + nn.Conv3d(neck['out_channels'], 1, kernel_size=1)) + + self.mean_mapping = nn.Sequential( + nn.Conv3d( + neck['out_channels'], nerf_feature_dim // 2, kernel_size=1)) + + self.cov_mapping = nn.Sequential( + nn.Conv3d( + neck['out_channels'], nerf_feature_dim // 2, kernel_size=1)) + + self.mapping = nn.Sequential( + nn.Linear(neck['out_channels'], nerf_feature_dim // 2)) + + self.mapping_2d = nn.Sequential( + nn.Conv2d( + neck['out_channels'], nerf_feature_dim // 2, kernel_size=1)) + # self.overfit_nerfmlp = overfit_nerfmlp + # if self.overfit_nerfmlp: + # self. _finetuning_NeRF_MLP() + self.render_testing = render_testing + + def extract_feat(self, + batch_inputs_dict: dict, + batch_data_samples: SampleList, + mode, + depth=None, + ray_batch=None): + """Extract 3d features from the backbone -> fpn -> 3d projection. + + -> 3d neck -> bbox_head. + + Args: + batch_inputs_dict (dict): The model input dict which include + the 'imgs' key. + + - imgs (torch.Tensor, optional): Image of each sample. + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instances` of `gt_panoptic_seg` or `gt_sem_seg` + + Returns: + Tuple: + - torch.Tensor: Features of shape (N, C_out, N_x, N_y, N_z). + - torch.Tensor: Valid mask of shape (N, 1, N_x, N_y, N_z). + - torch.Tensor: 2D features if needed. + - dict: The nerf rendered information including the + 'output_coarse', 'gt_rgb' and 'gt_depth' keys. + """ + img = batch_inputs_dict['imgs'] + img = img.float() + batch_img_metas = [ + data_samples.metainfo for data_samples in batch_data_samples + ] + batch_size = img.shape[0] + + if len(img.shape) > 4: + img = img.reshape([-1] + list(img.shape)[2:]) + x = self.backbone(img) + x = self.neck(x)[0] + x = x.reshape([batch_size, -1] + list(x.shape[1:])) + else: + x = self.backbone(img) + x = self.neck(x)[0] + + if depth is not None: + depth_bs = depth.shape[0] + assert depth_bs == batch_size + depth = batch_inputs_dict['depth'] + depth = depth.reshape([-1] + list(depth.shape)[2:]) + + features_2d = self.head_2d.forward(x[-1], batch_img_metas) \ + if self.head_2d is not None else None + + stride = img.shape[-1] / x.shape[-1] + assert stride == 4 + stride = int(stride) + + volumes, valids = [], [] + rgb_preds = [] + + for feature, img_meta in zip(x, batch_img_metas): + angles = features_2d[ + 0] if features_2d is not None and mode == 'test' else None + projection = self._compute_projection(img_meta, stride, + angles).to(x.device) + points = get_points( + n_voxels=torch.tensor(self.n_voxels), + voxel_size=torch.tensor(self.voxel_size), + origin=torch.tensor(img_meta['lidar2img']['origin'])).to( + x.device) + + height = img_meta['img_shape'][0] // stride + width = img_meta['img_shape'][1] // stride + # Construct the volume space + # volume together with valid is the constructed scene + # volume represents V_i and valid represents M_p + volume, valid = backproject(feature[:, :, :height, :width], points, + projection, depth, self.voxel_size) + density = None + volume_sum = volume.sum(dim=0) + # cov_valid = valid.clone().detach() + valid = valid.sum(dim=0) + volume_mean = volume_sum / (valid + 1e-8) + volume_mean[:, valid[0] == 0] = .0 + # volume_cov = (volume - volume_mean.unsqueeze(0)) ** 2 * cov_valid + 
# volume_cov = torch.sum(volume_cov, dim=0) / (valid + 1e-8) + volume_cov = torch.sum( + (volume - volume_mean.unsqueeze(0))**2, dim=0) / ( + valid + 1e-8) + volume_cov[:, valid[0] == 0] = 1e6 + volume_cov = torch.exp(-volume_cov) # default setting + # be careful here, the smaller the cov, the larger the weight. + n_channels, n_x_voxels, n_y_voxels, n_z_voxels = volume_mean.shape + if ray_batch is not None: + if self.nerf_mode == 'volume': + mean_volume = self.mean_mapping(volume_mean.unsqueeze(0)) + cov_volume = self.cov_mapping(volume_cov.unsqueeze(0)) + feature_2d = feature[:, :, :height, :width] + + elif self.nerf_mode == 'image': + mean_volume = None + cov_volume = None + feature_2d = feature[:, :, :height, :width] + n_v, C, height, width = feature_2d.shape + feature_2d = feature_2d.view(n_v, C, + -1).permute(0, 2, + 1).contiguous() + feature_2d = self.mapping(feature_2d).permute( + 0, 2, 1).contiguous().view(n_v, -1, height, width) + + denorm_images = ray_batch['denorm_images'] + denorm_images = denorm_images.reshape( + [-1] + list(denorm_images.shape)[2:]) + rgb_projection = self._compute_projection( + img_meta, stride=1, angles=None).to(x.device) + + rgb_volume, _ = backproject( + denorm_images[:, :, :img_meta['img_shape'][0], : + img_meta['img_shape'][1]], points, + rgb_projection, depth, self.voxel_size) + + ret = render_rays( + ray_batch, + mean_volume, + cov_volume, + feature_2d, + denorm_images, + self.aabb, + self.near_far_range, + self.N_samples, + self.N_rand, + self.nerf_mlp, + img_meta, + self.projector, + self.nerf_mode, + self.nerf_sample_view, + is_train=True if mode == 'train' else False, + render_testing=self.render_testing) + rgb_preds.append(ret) + + if self.nerf_density: + # would have 0 bias issue for mean_mapping. + n_v, C, n_x_voxels, n_y_voxels, n_z_voxels = volume.shape + volume = volume.view(n_v, C, -1).permute(0, 2, + 1).contiguous() + mapping_volume = self.mapping(volume).permute( + 0, 2, 1).contiguous().view(n_v, -1, n_x_voxels, + n_y_voxels, n_z_voxels) + + mapping_volume = torch.cat([rgb_volume, mapping_volume], + dim=1) + mapping_volume_sum = mapping_volume.sum(dim=0) + mapping_volume_mean = mapping_volume_sum / (valid + 1e-8) + # mapping_volume_cov = ( + # mapping_volume - mapping_volume_mean.unsqueeze(0) + # ) ** 2 * cov_valid + mapping_volume_cov = (mapping_volume - + mapping_volume_mean.unsqueeze(0))**2 + mapping_volume_cov = torch.sum( + mapping_volume_cov, dim=0) / ( + valid + 1e-8) + mapping_volume_cov[:, valid[0] == 0] = 1e6 + mapping_volume_cov = torch.exp( + -mapping_volume_cov) # default setting + global_volume = torch.cat( + [mapping_volume_mean, mapping_volume_cov], dim=1) + global_volume = global_volume.view( + -1, n_x_voxels * n_y_voxels * n_z_voxels).permute( + 1, 0).contiguous() + points = points.view(3, -1).permute(1, 0).contiguous() + density = self.nerf_mlp.query_density( + points, global_volume) + alpha = 1 - torch.exp(-density) + # density -> alpha + # (1, n_x_voxels, n_y_voxels, n_z_voxels) + volume = alpha.view(1, n_x_voxels, n_y_voxels, + n_z_voxels) * volume_mean + volume[:, valid[0] == 0] = .0 + + volumes.append(volume) + valids.append(valid) + x = torch.stack(volumes) + x = self.neck_3d(x) + + return x, torch.stack(valids).float(), features_2d, rgb_preds + + def loss(self, batch_inputs_dict: dict, batch_data_samples: SampleList, + **kwargs) -> Union[dict, list]: + """Calculate losses from a batch of inputs and data samples. + + Args: + batch_inputs_dict (dict): The model input dict which include + the 'imgs' key. 
+ + - imgs (torch.Tensor, optional): Image of each sample. + batch_data_samples (list[:obj: `DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + + Returns: + dict: A dictionary of loss components. + """ + ray_batchs = {} + batch_images = [] + batch_depths = [] + if 'images' in batch_data_samples[0].gt_nerf_images: + for data_samples in batch_data_samples: + image = data_samples.gt_nerf_images['images'] + batch_images.append(image) + batch_images = torch.stack(batch_images) + + if 'depths' in batch_data_samples[0].gt_nerf_depths: + for data_samples in batch_data_samples: + depth = data_samples.gt_nerf_depths['depths'] + batch_depths.append(depth) + batch_depths = torch.stack(batch_depths) + + if 'raydirs' in batch_inputs_dict.keys(): + ray_batchs['ray_o'] = batch_inputs_dict['lightpos'] + ray_batchs['ray_d'] = batch_inputs_dict['raydirs'] + ray_batchs['gt_rgb'] = batch_images + ray_batchs['gt_depth'] = batch_depths + ray_batchs['nerf_sizes'] = batch_inputs_dict['nerf_sizes'] + ray_batchs['denorm_images'] = batch_inputs_dict['denorm_images'] + x, valids, features_2d, rgb_preds = self.extract_feat( + batch_inputs_dict, + batch_data_samples, + 'train', + depth=None, + ray_batch=ray_batchs) + else: + x, valids, features_2d, rgb_preds = self.extract_feat( + batch_inputs_dict, batch_data_samples, 'train') + x += (valids, ) + losses = self.bbox_head.loss(x, batch_data_samples, **kwargs) + + # if self.head_2d is not None: + # losses.update( + # self.head_2d.loss(*features_2d, batch_data_samples) + # ) + if len(ray_batchs) != 0 and self.rgb_supervision: + losses.update(self.nvs_loss_func(rgb_preds)) + if self.depth_supervise: + losses.update(self.depth_loss_func(rgb_preds)) + return losses + + def nvs_loss_func(self, rgb_pred): + loss = 0 + for ret in rgb_pred: + rgb = ret['outputs_coarse']['rgb'] + gt = ret['gt_rgb'] + masks = ret['outputs_coarse']['mask'] + if self.use_nerf_mask: + loss += torch.sum(masks.unsqueeze(-1) * (rgb - gt)**2) / ( + masks.sum() + 1e-6) + else: + loss += torch.mean((rgb - gt)**2) + return dict(loss_nvs=loss) + + def depth_loss_func(self, rgb_pred): + loss = 0 + for ret in rgb_pred: + depth = ret['outputs_coarse']['depth'] + gt = ret['gt_depth'].squeeze(-1) + masks = ret['outputs_coarse']['mask'] + if self.use_nerf_mask: + loss += torch.sum(masks * torch.abs(depth - gt)) / ( + masks.sum() + 1e-6) + else: + loss += torch.mean(torch.abs(depth - gt)) + + return dict(loss_depth=loss) + + def predict(self, batch_inputs_dict: dict, batch_data_samples: SampleList, + **kwargs) -> SampleList: + """Predict results from a batch of inputs and data samples with post- + processing. + + Args: + batch_inputs_dict (dict): The model input dict which include + the 'imgs' key. + + - imgs (torch.Tensor, optional): Image of each sample. + + batch_data_samples (List[:obj:`NeRFDet3DDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance_3d`, `gt_panoptic_seg_3d` and `gt_sem_seg_3d`. + + Returns: + list[:obj:`NeRFDet3DDataSample`]: Detection results of the + input images. Each NeRFDet3DDataSample usually contain + 'pred_instances_3d'. And the ``pred_instances_3d`` usually + contains following keys. + + - scores_3d (Tensor): Classification scores, has a shape + (num_instance, ) + - labels_3d (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes_3d (Tensor): Contains a tensor with shape + (num_instances, C) where C = 6. 
+ """ + ray_batchs = {} + batch_images = [] + batch_depths = [] + if 'images' in batch_data_samples[0].gt_nerf_images: + for data_samples in batch_data_samples: + image = data_samples.gt_nerf_images['images'] + batch_images.append(image) + batch_images = torch.stack(batch_images) + + if 'depths' in batch_data_samples[0].gt_nerf_depths: + for data_samples in batch_data_samples: + depth = data_samples.gt_nerf_depths['depths'] + batch_depths.append(depth) + batch_depths = torch.stack(batch_depths) + + if 'raydirs' in batch_inputs_dict.keys(): + ray_batchs['ray_o'] = batch_inputs_dict['lightpos'] + ray_batchs['ray_d'] = batch_inputs_dict['raydirs'] + ray_batchs['gt_rgb'] = batch_images + ray_batchs['gt_depth'] = batch_depths + ray_batchs['nerf_sizes'] = batch_inputs_dict['nerf_sizes'] + ray_batchs['denorm_images'] = batch_inputs_dict['denorm_images'] + x, valids, features_2d, rgb_preds = self.extract_feat( + batch_inputs_dict, + batch_data_samples, + 'test', + depth=None, + ray_batch=ray_batchs) + else: + x, valids, features_2d, rgb_preds = self.extract_feat( + batch_inputs_dict, batch_data_samples, 'test') + x += (valids, ) + results_list = self.bbox_head.predict(x, batch_data_samples, **kwargs) + predictions = self.add_pred_to_datasample(batch_data_samples, + results_list) + return predictions + + def _forward(self, batch_inputs_dict: dict, batch_data_samples: SampleList, + *args, **kwargs) -> Tuple[List[torch.Tensor]]: + """Network forward process. Usually includes backbone, neck and head + forward without any post-processing. + + Args: + batch_inputs_dict (dict): The model input dict which include + the 'imgs' key. + + - imgs (torch.Tensor, optional): Image of each sample. + + batch_data_samples (List[:obj:`NeRFDet3DDataSample`]): The Data + Samples. 
It usually includes information such as + `gt_instance_3d`, `gt_panoptic_seg_3d` and `gt_sem_seg_3d` + + Returns: + tuple[list]: A tuple of features from ``bbox_head`` forward + """ + ray_batchs = {} + batch_images = [] + batch_depths = [] + if 'images' in batch_data_samples[0].gt_nerf_images: + for data_samples in batch_data_samples: + image = data_samples.gt_nerf_images['images'] + batch_images.append(image) + batch_images = torch.stack(batch_images) + + if 'depths' in batch_data_samples[0].gt_nerf_depths: + for data_samples in batch_data_samples: + depth = data_samples.gt_nerf_depths['depths'] + batch_depths.append(depth) + batch_depths = torch.stack(batch_depths) + if 'raydirs' in batch_inputs_dict.keys(): + ray_batchs['ray_o'] = batch_inputs_dict['lightpos'] + ray_batchs['ray_d'] = batch_inputs_dict['raydirs'] + ray_batchs['gt_rgb'] = batch_images + ray_batchs['gt_depth'] = batch_depths + ray_batchs['nerf_sizes'] = batch_inputs_dict['nerf_sizes'] + ray_batchs['denorm_images'] = batch_inputs_dict['denorm_images'] + x, valids, features_2d, rgb_preds = self.extract_feat( + batch_inputs_dict, + batch_data_samples, + 'train', + depth=None, + ray_batch=ray_batchs) + else: + x, valids, features_2d, rgb_preds = self.extract_feat( + batch_inputs_dict, batch_data_samples, 'train') + x += (valids, ) + results = self.bbox_head.forward(x) + return results + + def aug_test(self, batch_inputs_dict, batch_data_samples): + pass + + def show_results(self, *args, **kwargs): + pass + + @staticmethod + def _compute_projection(img_meta, stride, angles): + projection = [] + intrinsic = torch.tensor(img_meta['lidar2img']['intrinsic'][:3, :3]) + ratio = img_meta['ori_shape'][0] / (img_meta['img_shape'][0] / stride) + intrinsic[:2] /= ratio + # use predict pitch and roll for SUNRGBDTotal test + if angles is not None: + extrinsics = [] + for angle in angles: + extrinsics.append(get_extrinsics(angle).to(intrinsic.device)) + else: + extrinsics = map(torch.tensor, img_meta['lidar2img']['extrinsic']) + for extrinsic in extrinsics: + projection.append(intrinsic @ extrinsic[:3]) + return torch.stack(projection) + + +@torch.no_grad() +def get_points(n_voxels, voxel_size, origin): + # origin: point-cloud center. + points = torch.stack( + torch.meshgrid([ + torch.arange(n_voxels[0]), # 40 W width, x + torch.arange(n_voxels[1]), # 40 D depth, y + torch.arange(n_voxels[2]) # 16 H Height, z + ])) + new_origin = origin - n_voxels / 2. 
* voxel_size + points = points * voxel_size.view(3, 1, 1, 1) + new_origin.view(3, 1, 1, 1) + return points + + +# modify from https://github.com/magicleap/Atlas/blob/master/atlas/model.py +def backproject(features, points, projection, depth, voxel_size): + n_images, n_channels, height, width = features.shape + n_x_voxels, n_y_voxels, n_z_voxels = points.shape[-3:] + points = points.view(1, 3, -1).expand(n_images, 3, -1) + points = torch.cat((points, torch.ones_like(points[:, :1])), dim=1) + points_2d_3 = torch.bmm(projection, points) + + x = (points_2d_3[:, 0] / points_2d_3[:, 2]).round().long() + y = (points_2d_3[:, 1] / points_2d_3[:, 2]).round().long() + z = points_2d_3[:, 2] + valid = (x >= 0) & (y >= 0) & (x < width) & (y < height) & (z > 0) + # below is using depth to sample feature + if depth is not None: + depth = F.interpolate( + depth.unsqueeze(1), size=(height, width), + mode='bilinear').squeeze(1) + for i in range(n_images): + z_mask = z.clone() > 0 + z_mask[i, valid[i]] = \ + (z[i, valid[i]] > depth[i, y[i, valid[i]], x[i, valid[i]]] - voxel_size[-1]) & \ + (z[i, valid[i]] < depth[i, y[i, valid[i]], x[i, valid[i]]] + voxel_size[-1]) # noqa + valid = valid & z_mask + + volume = torch.zeros((n_images, n_channels, points.shape[-1]), + device=features.device) + for i in range(n_images): + volume[i, :, valid[i]] = features[i, :, y[i, valid[i]], x[i, valid[i]]] + volume = volume.view(n_images, n_channels, n_x_voxels, n_y_voxels, + n_z_voxels) + valid = valid.view(n_images, 1, n_x_voxels, n_y_voxels, n_z_voxels) + + return volume, valid + + +# for SUNRGBDTotal test +def get_extrinsics(angles): + yaw = angles.new_zeros(()) + pitch, roll = angles + r = angles.new_zeros((3, 3)) + r[0, 0] = torch.cos(yaw) * torch.cos(pitch) + r[0, 1] = torch.sin(yaw) * torch.sin(roll) - torch.cos(yaw) * torch.cos( + roll) * torch.sin(pitch) + r[0, 2] = torch.cos(roll) * torch.sin(yaw) + torch.cos(yaw) * torch.sin( + pitch) * torch.sin(roll) + r[1, 0] = torch.sin(pitch) + r[1, 1] = torch.cos(pitch) * torch.cos(roll) + r[1, 2] = -torch.cos(pitch) * torch.sin(roll) + r[2, 0] = -torch.cos(pitch) * torch.sin(yaw) + r[2, 1] = torch.cos(yaw) * torch.sin(roll) + torch.cos(roll) * torch.sin( + yaw) * torch.sin(pitch) + r[2, 2] = torch.cos(yaw) * torch.cos(roll) - torch.sin(yaw) * torch.sin( + pitch) * torch.sin(roll) + + # follow Total3DUnderstanding + t = angles.new_tensor([[0., 0., 1.], [0., -1., 0.], [-1., 0., 0.]]) + r = t @ r.T + # follow DepthInstance3DBoxes + r = r[:, [2, 0, 1]] + r[2] *= -1 + extrinsic = angles.new_zeros((4, 4)) + extrinsic[:3, :3] = r + extrinsic[3, 3] = 1. + return extrinsic diff --git a/projects/NeRF-Det/nerfdet/nerfdet_head.py b/projects/NeRF-Det/nerfdet/nerfdet_head.py new file mode 100644 index 0000000000..d5faa0adc1 --- /dev/null +++ b/projects/NeRF-Det/nerfdet/nerfdet_head.py @@ -0,0 +1,629 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
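The `get_points` / `backproject` pair above builds a regular grid of voxel centres around the scene origin and keeps only the voxels whose camera projections land inside the image. Below is a minimal, self-contained sketch of that projection test, using a toy 3x4 projection matrix and an arbitrary grid size (illustration only, not the project's actual camera parameters):

```python
import torch

# Toy voxel grid: 4 x 4 x 2 voxels of size 0.16 x 0.16 x 0.2 m around `origin`.
n_voxels = torch.tensor([4, 4, 2])
voxel_size = torch.tensor([0.16, 0.16, 0.2])
origin = torch.tensor([0., 0., 1.5])
points = torch.stack(
    torch.meshgrid([torch.arange(int(n)) for n in n_voxels],
                   indexing='ij')).float()
points = points * voxel_size.view(3, 1, 1, 1) + \
    (origin - n_voxels / 2. * voxel_size).view(3, 1, 1, 1)
points = points.view(3, -1)                                 # [3, n_voxels_total]

# Toy 3x4 projection matrix (K @ [R|t]) for a 640x480 image, as in backproject.
projection = torch.tensor([[400., 0., 320., 0.],
                           [0., 400., 240., 0.],
                           [0., 0., 1., 0.]])
homo = torch.cat([points, torch.ones(1, points.shape[1])])  # homogeneous coords
uvw = projection @ homo
x = (uvw[0] / uvw[2]).round().long()
y = (uvw[1] / uvw[2]).round().long()
valid = (x >= 0) & (y >= 0) & (x < 640) & (y < 480) & (uvw[2] > 0)
print(points.shape, int(valid.sum()))  # only valid voxels receive image features
```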
+from typing import List, Tuple + +import torch +from mmcv.cnn import Scale +# from mmcv.ops import nms3d, nms3d_normal +from mmdet.models.utils import multi_apply +from mmdet.utils import reduce_mean +# from mmengine.config import ConfigDict +from mmengine.model import BaseModule, bias_init_with_prob, normal_init +from mmengine.structures import InstanceData +from torch import Tensor, nn + +from mmdet3d.registry import MODELS, TASK_UTILS +# from mmdet3d.structures.bbox_3d.utils import rotation_3d_in_axis +from mmdet3d.structures.det3d_data_sample import SampleList +from mmdet3d.utils.typing_utils import (ConfigType, InstanceList, + OptConfigType, OptInstanceList) + + +@torch.no_grad() +def get_points(n_voxels, voxel_size, origin): + # origin: point-cloud center. + points = torch.stack( + torch.meshgrid([ + torch.arange(n_voxels[0]), # 40 W width, x + torch.arange(n_voxels[1]), # 40 D depth, y + torch.arange(n_voxels[2]) # 16 H Height, z + ])) + new_origin = origin - n_voxels / 2. * voxel_size + points = points * voxel_size.view(3, 1, 1, 1) + new_origin.view(3, 1, 1, 1) + return points + + +@MODELS.register_module() +class NerfDetHead(BaseModule): + r"""`ImVoxelNet`_ head for indoor + datasets. + + Args: + n_classes (int): Number of classes. + n_levels (int): Number of feature levels. + n_channels (int): Number of channels in input tensors. + n_reg_outs (int): Number of regression layer channels. + pts_assign_threshold (int): Min number of location per box to + be assigned with. + pts_center_threshold (int): Max number of locations per box to + be assigned with. + center_loss (dict, optional): Config of centerness loss. + Default: dict(type='CrossEntropyLoss', use_sigmoid=True). + bbox_loss (dict, optional): Config of bbox loss. + Default: dict(type='RotatedIoU3DLoss'). + cls_loss (dict, optional): Config of classification loss. + Default: dict(type='FocalLoss'). + train_cfg (dict, optional): Config for train stage. Defaults to None. + test_cfg (dict, optional): Config for test stage. Defaults to None. + init_cfg (dict, optional): Config for weight initialization. + Defaults to None. + """ + + def __init__(self, + n_classes: int, + n_levels: int, + n_channels: int, + n_reg_outs: int, + pts_assign_threshold: int, + pts_center_threshold: int, + prior_generator: ConfigType, + center_loss: ConfigType = dict( + type='mmdet.CrossEntropyLoss', use_sigmoid=True), + bbox_loss: ConfigType = dict(type='RotatedIoU3DLoss'), + cls_loss: ConfigType = dict(type='mmdet.FocalLoss'), + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + init_cfg: OptConfigType = None): + super(NerfDetHead, self).__init__(init_cfg) + self.n_classes = n_classes + self.n_levels = n_levels + self.n_reg_outs = n_reg_outs + self.pts_assign_threshold = pts_assign_threshold + self.pts_center_threshold = pts_center_threshold + self.prior_generator = TASK_UTILS.build(prior_generator) + self.center_loss = MODELS.build(center_loss) + self.bbox_loss = MODELS.build(bbox_loss) + self.cls_loss = MODELS.build(cls_loss) + self.train_cfg = train_cfg + self.test_cfg = test_cfg + self._init_layers(n_channels, n_reg_outs, n_classes, n_levels) + + def _init_layers(self, n_channels, n_reg_outs, n_classes, n_levels): + """Initialize neural network layers of the head.""" + self.conv_center = nn.Conv3d(n_channels, 1, 3, padding=1, bias=False) + self.conv_reg = nn.Conv3d( + n_channels, n_reg_outs, 3, padding=1, bias=False) + self.conv_cls = nn.Conv3d(n_channels, n_classes, 3, padding=1) + self.scales = nn.ModuleList([Scale(1.) 
for _ in range(n_levels)]) + + def init_weights(self): + """Initialize all layer weights.""" + normal_init(self.conv_center, std=.01) + normal_init(self.conv_reg, std=.01) + normal_init(self.conv_cls, std=.01, bias=bias_init_with_prob(.01)) + + def _forward_single(self, x: Tensor, scale: Scale): + """Forward pass per level. + + Args: + x (Tensor): Per level 3d neck output tensor. + scale (mmcv.cnn.Scale): Per level multiplication weight. + + Returns: + tuple[Tensor]: Centerness, bbox and classification predictions. + """ + return (self.conv_center(x), torch.exp(scale(self.conv_reg(x))), + self.conv_cls(x)) + + def forward(self, x): + return multi_apply(self._forward_single, x, self.scales) + + def loss(self, x: Tuple[Tensor], batch_data_samples: SampleList, + **kwargs) -> dict: + """Perform forward propagation and loss calculation of the detection + head on the features of the upstream network. + + Args: + x (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + batch_data_samples (List[:obj:`NeRFDet3DDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + + Returns: + dict: A dictionary of loss components. + """ + valid_pred = x[-1] + outs = self(x[:-1]) + + batch_gt_instances_3d = [] + batch_gt_instances_ignore = [] + batch_input_metas = [] + for data_sample in batch_data_samples: + batch_input_metas.append(data_sample.metainfo) + batch_gt_instances_3d.append(data_sample.gt_instances_3d) + batch_gt_instances_ignore.append( + data_sample.get('ignored_instances', None)) + + loss_inputs = outs + (valid_pred, batch_gt_instances_3d, + batch_input_metas, batch_gt_instances_ignore) + losses = self.loss_by_feat(*loss_inputs) + return losses + + def loss_by_feat(self, + center_preds: List[List[Tensor]], + bbox_preds: List[List[Tensor]], + cls_preds: List[List[Tensor]], + valid_pred: Tensor, + batch_gt_instances_3d: InstanceList, + batch_input_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None, + **kwargs) -> dict: + """Per scene loss function. + + Args: + center_preds (list[list[Tensor]]): Centerness predictions for + all scenes. The first list contains predictions from different + levels. The second list contains predictions in a mini-batch. + bbox_preds (list[list[Tensor]]): Bbox predictions for all scenes. + The first list contains predictions from different + levels. The second list contains predictions in a mini-batch. + cls_preds (list[list[Tensor]]): Classification predictions for all + scenes. The first list contains predictions from different + levels. The second list contains predictions in a mini-batch. + valid_pred (Tensor): Valid mask prediction for all scenes. + batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of + gt_instance_3d. It usually includes ``bboxes_3d``、` + `labels_3d``、``depths``、``centers_2d`` and attributes. + batch_input_metas (list[dict]): Meta information of each image, + e.g., image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict: Centerness, bbox, and classification loss values. 
+ """ + valid_preds = self._upsample_valid_preds(valid_pred, center_preds) + center_losses, bbox_losses, cls_losses = [], [], [] + for i in range(len(batch_input_metas)): + center_loss, bbox_loss, cls_loss = self._loss_by_feat_single( + center_preds=[x[i] for x in center_preds], + bbox_preds=[x[i] for x in bbox_preds], + cls_preds=[x[i] for x in cls_preds], + valid_preds=[x[i] for x in valid_preds], + input_meta=batch_input_metas[i], + gt_bboxes=batch_gt_instances_3d[i].bboxes_3d, + gt_labels=batch_gt_instances_3d[i].labels_3d) + center_losses.append(center_loss) + bbox_losses.append(bbox_loss) + cls_losses.append(cls_loss) + return dict( + center_loss=torch.mean(torch.stack(center_losses)), + bbox_loss=torch.mean(torch.stack(bbox_losses)), + cls_loss=torch.mean(torch.stack(cls_losses))) + + def _loss_by_feat_single(self, center_preds, bbox_preds, cls_preds, + valid_preds, input_meta, gt_bboxes, gt_labels): + featmap_sizes = [featmap.size()[-3:] for featmap in center_preds] + points = self._get_points( + featmap_sizes=featmap_sizes, + origin=input_meta['lidar2img']['origin'], + device=gt_bboxes.device) + center_targets, bbox_targets, cls_targets = self._get_targets( + points, gt_bboxes, gt_labels) + + center_preds = torch.cat( + [x.permute(1, 2, 3, 0).reshape(-1) for x in center_preds]) + bbox_preds = torch.cat([ + x.permute(1, 2, 3, 0).reshape(-1, x.shape[0]) for x in bbox_preds + ]) + cls_preds = torch.cat( + [x.permute(1, 2, 3, 0).reshape(-1, x.shape[0]) for x in cls_preds]) + valid_preds = torch.cat( + [x.permute(1, 2, 3, 0).reshape(-1) for x in valid_preds]) + points = torch.cat(points) + + # cls loss + pos_inds = torch.nonzero( + torch.logical_and(cls_targets >= 0, valid_preds)).squeeze(1) + n_pos = points.new_tensor(len(pos_inds)) + n_pos = max(reduce_mean(n_pos), 1.) + if torch.any(valid_preds): + cls_loss = self.cls_loss( + cls_preds[valid_preds], + cls_targets[valid_preds], + avg_factor=n_pos) + else: + cls_loss = cls_preds[valid_preds].sum() + + # bbox and centerness losses + pos_center_preds = center_preds[pos_inds] + pos_bbox_preds = bbox_preds[pos_inds] + if len(pos_inds) > 0: + pos_center_targets = center_targets[pos_inds] + pos_bbox_targets = bbox_targets[pos_inds] + pos_points = points[pos_inds] + center_loss = self.center_loss( + pos_center_preds, pos_center_targets, avg_factor=n_pos) + bbox_loss = self.bbox_loss( + self._bbox_pred_to_bbox(pos_points, pos_bbox_preds), + pos_bbox_targets, + weight=pos_center_targets, + avg_factor=pos_center_targets.sum()) + else: + center_loss = pos_center_preds.sum() + bbox_loss = pos_bbox_preds.sum() + return center_loss, bbox_loss, cls_loss + + def predict(self, + x: Tuple[Tensor], + batch_data_samples: SampleList, + rescale: bool = False) -> InstanceList: + """Perform forward propagation of the 3D detection head and predict + detection results on the features of the upstream network. + + Args: + x (tuple[Tensor]): Multi-level features from the + upstream network, each is a 4D-tensor. + batch_data_samples (List[:obj:`NeRFDet3DDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance_3d`, `gt_pts_panoptic_seg` and + `gt_pts_sem_seg`. + rescale (bool, optional): Whether to rescale the results. + Defaults to False. + + Returns: + list[:obj:`InstanceData`]: Detection results of each sample + after the post process. + Each item usually contains following keys. 
+ + - scores_3d (Tensor): Classification scores, has a shape + (num_instances, ) + - labels_3d (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes_3d (BaseInstance3DBoxes): Prediction of bboxes, + contains a tensor with shape (num_instances, C), where + C >= 6. + """ + batch_input_metas = [ + data_samples.metainfo for data_samples in batch_data_samples + ] + valid_pred = x[-1] + outs = self(x[:-1]) + predictions = self.predict_by_feat( + *outs, + valid_pred=valid_pred, + batch_input_metas=batch_input_metas, + rescale=rescale) + return predictions + + def predict_by_feat(self, center_preds: List[List[Tensor]], + bbox_preds: List[List[Tensor]], + cls_preds: List[List[Tensor]], valid_pred: Tensor, + batch_input_metas: List[dict], + **kwargs) -> List[InstanceData]: + """Generate boxes for all scenes. + + Args: + center_preds (list[list[Tensor]]): Centerness predictions for + all scenes. + bbox_preds (list[list[Tensor]]): Bbox predictions for all scenes. + cls_preds (list[list[Tensor]]): Classification predictions for all + scenes. + valid_pred (Tensor): Valid mask prediction for all scenes. + batch_input_metas (list[dict]): Meta infos for all scenes. + + Returns: + list[tuple[Tensor]]: Predicted bboxes, scores, and labels for + all scenes. + """ + valid_preds = self._upsample_valid_preds(valid_pred, center_preds) + results = [] + for i in range(len(batch_input_metas)): + results.append( + self._predict_by_feat_single( + center_preds=[x[i] for x in center_preds], + bbox_preds=[x[i] for x in bbox_preds], + cls_preds=[x[i] for x in cls_preds], + valid_preds=[x[i] for x in valid_preds], + input_meta=batch_input_metas[i])) + return results + + def _predict_by_feat_single(self, center_preds: List[Tensor], + bbox_preds: List[Tensor], + cls_preds: List[Tensor], + valid_preds: List[Tensor], + input_meta: dict) -> InstanceData: + """Generate boxes for single sample. + + Args: + center_preds (list[Tensor]): Centerness predictions for all levels. + bbox_preds (list[Tensor]): Bbox predictions for all levels. + cls_preds (list[Tensor]): Classification predictions for all + levels. + valid_preds (tuple[Tensor]): Upsampled valid masks for all feature + levels. + input_meta (dict): Scene meta info. + + Returns: + tuple[Tensor]: Predicted bounding boxes, scores and labels. 
+ """ + featmap_sizes = [featmap.size()[-3:] for featmap in center_preds] + points = self._get_points( + featmap_sizes=featmap_sizes, + origin=input_meta['lidar2img']['origin'], + device=center_preds[0].device) + mlvl_bboxes, mlvl_scores = [], [] + for center_pred, bbox_pred, cls_pred, valid_pred, point in zip( + center_preds, bbox_preds, cls_preds, valid_preds, points): + center_pred = center_pred.permute(1, 2, 3, 0).reshape(-1, 1) + bbox_pred = bbox_pred.permute(1, 2, 3, + 0).reshape(-1, bbox_pred.shape[0]) + cls_pred = cls_pred.permute(1, 2, 3, + 0).reshape(-1, cls_pred.shape[0]) + valid_pred = valid_pred.permute(1, 2, 3, 0).reshape(-1, 1) + scores = cls_pred.sigmoid() * center_pred.sigmoid() * valid_pred + max_scores, _ = scores.max(dim=1) + + if len(scores) > self.test_cfg.nms_pre > 0: + _, ids = max_scores.topk(self.test_cfg.nms_pre) + bbox_pred = bbox_pred[ids] + scores = scores[ids] + point = point[ids] + + bboxes = self._bbox_pred_to_bbox(point, bbox_pred) + mlvl_bboxes.append(bboxes) + mlvl_scores.append(scores) + + bboxes = torch.cat(mlvl_bboxes) + scores = torch.cat(mlvl_scores) + bboxes, scores, labels = self._nms(bboxes, scores, input_meta) + + bboxes = input_meta['box_type_3d']( + bboxes, box_dim=6, with_yaw=False, origin=(.5, .5, .5)) + + results = InstanceData() + results.bboxes_3d = bboxes + results.scores_3d = scores + results.labels_3d = labels + return results + + @staticmethod + def _upsample_valid_preds(valid_pred, features): + """Upsample valid mask predictions. + + Args: + valid_pred (Tensor): Valid mask prediction. + features (Tensor): Feature tensor. + + Returns: + tuple[Tensor]: Upsampled valid masks for all feature levels. + """ + return [ + nn.Upsample(size=x.shape[-3:], + mode='trilinear')(valid_pred).round().bool() + for x in features + ] + + @torch.no_grad() + def _get_points(self, featmap_sizes, origin, device): + mlvl_points = [] + tmp_voxel_size = [.16, .16, .2] + for i, featmap_size in enumerate(featmap_sizes): + mlvl_points.append( + get_points( + n_voxels=torch.tensor(featmap_size), + voxel_size=torch.tensor(tmp_voxel_size) * (2**i), + origin=torch.tensor(origin)).reshape(3, -1).transpose( + 0, 1).to(device)) + return mlvl_points + + def _bbox_pred_to_bbox(self, points, bbox_pred): + return torch.stack([ + points[:, 0] - bbox_pred[:, 0], points[:, 1] - bbox_pred[:, 2], + points[:, 2] - bbox_pred[:, 4], points[:, 0] + bbox_pred[:, 1], + points[:, 1] + bbox_pred[:, 3], points[:, 2] + bbox_pred[:, 5] + ], -1) + + def _bbox_pred_to_loss(self, points, bbox_preds): + return self._bbox_pred_to_bbox(points, bbox_preds) + + # The function is directly copied from FCAF3DHead. + @staticmethod + def _get_face_distances(points, boxes): + """Calculate distances from point to box faces. + + Args: + points (Tensor): Final locations of shape (N_points, N_boxes, 3). + boxes (Tensor): 3D boxes of shape (N_points, N_boxes, 7) + + Returns: + Tensor: Face distances of shape (N_points, N_boxes, 6), + (dx_min, dx_max, dy_min, dy_max, dz_min, dz_max). 
+ """ + dx_min = points[..., 0] - boxes[..., 0] + boxes[..., 3] / 2 + dx_max = boxes[..., 0] + boxes[..., 3] / 2 - points[..., 0] + dy_min = points[..., 1] - boxes[..., 1] + boxes[..., 4] / 2 + dy_max = boxes[..., 1] + boxes[..., 4] / 2 - points[..., 1] + dz_min = points[..., 2] - boxes[..., 2] + boxes[..., 5] / 2 + dz_max = boxes[..., 2] + boxes[..., 5] / 2 - points[..., 2] + return torch.stack((dx_min, dx_max, dy_min, dy_max, dz_min, dz_max), + dim=-1) + + @staticmethod + def _get_centerness(face_distances): + """Compute point centerness w.r.t containing box. + + Args: + face_distances (Tensor): Face distances of shape (B, N, 6), + (dx_min, dx_max, dy_min, dy_max, dz_min, dz_max). + + Returns: + Tensor: Centerness of shape (B, N). + """ + x_dims = face_distances[..., [0, 1]] + y_dims = face_distances[..., [2, 3]] + z_dims = face_distances[..., [4, 5]] + centerness_targets = x_dims.min(dim=-1)[0] / x_dims.max(dim=-1)[0] * \ + y_dims.min(dim=-1)[0] / y_dims.max(dim=-1)[0] * \ + z_dims.min(dim=-1)[0] / z_dims.max(dim=-1)[0] + return torch.sqrt(centerness_targets) + + @torch.no_grad() + def _get_targets(self, points, gt_bboxes, gt_labels): + """Compute targets for final locations for a single scene. + + Args: + points (list[Tensor]): Final locations for all levels. + gt_bboxes (BaseInstance3DBoxes): Ground truth boxes. + gt_labels (Tensor): Ground truth labels. + + Returns: + tuple[Tensor]: Centerness, bbox and classification + targets for all locations. + """ + float_max = 1e8 + expanded_scales = [ + points[i].new_tensor(i).expand(len(points[i])).to(gt_labels.device) + for i in range(len(points)) + ] + points = torch.cat(points, dim=0).to(gt_labels.device) + scales = torch.cat(expanded_scales, dim=0) + + # below is based on FCOSHead._get_target_single + n_points = len(points) + n_boxes = len(gt_bboxes) + volumes = gt_bboxes.volume.to(points.device) + volumes = volumes.expand(n_points, n_boxes).contiguous() + gt_bboxes = torch.cat( + (gt_bboxes.gravity_center, gt_bboxes.tensor[:, 3:6]), dim=1) + gt_bboxes = gt_bboxes.to(points.device).expand(n_points, n_boxes, 6) + expanded_points = points.unsqueeze(1).expand(n_points, n_boxes, 3) + bbox_targets = self._get_face_distances(expanded_points, gt_bboxes) + + # condition1: inside a gt bbox + inside_gt_bbox_mask = bbox_targets[..., :6].min( + -1)[0] > 0 # skip angle + + # condition2: positive points per scale >= limit + # calculate positive points per scale + n_pos_points_per_scale = [] + for i in range(self.n_levels): + n_pos_points_per_scale.append( + torch.sum(inside_gt_bbox_mask[scales == i], dim=0)) + # find best scale + n_pos_points_per_scale = torch.stack(n_pos_points_per_scale, dim=0) + lower_limit_mask = n_pos_points_per_scale < self.pts_assign_threshold + # fix nondeterministic argmax for torch<1.7 + extra = torch.arange(self.n_levels, 0, -1).unsqueeze(1).expand( + self.n_levels, n_boxes).to(lower_limit_mask.device) + lower_index = torch.argmax(lower_limit_mask.int() * extra, dim=0) - 1 + lower_index = torch.where(lower_index < 0, + torch.zeros_like(lower_index), lower_index) + all_upper_limit_mask = torch.all( + torch.logical_not(lower_limit_mask), dim=0) + best_scale = torch.where( + all_upper_limit_mask, + torch.ones_like(all_upper_limit_mask) * self.n_levels - 1, + lower_index) + # keep only points with best scale + best_scale = torch.unsqueeze(best_scale, 0).expand(n_points, n_boxes) + scales = torch.unsqueeze(scales, 1).expand(n_points, n_boxes) + inside_best_scale_mask = best_scale == scales + + # condition3: limit topk locations 
per box by centerness + centerness = self._get_centerness(bbox_targets) + centerness = torch.where(inside_gt_bbox_mask, centerness, + torch.ones_like(centerness) * -1) + centerness = torch.where(inside_best_scale_mask, centerness, + torch.ones_like(centerness) * -1) + top_centerness = torch.topk( + centerness, self.pts_center_threshold + 1, dim=0).values[-1] + inside_top_centerness_mask = centerness > top_centerness.unsqueeze(0) + + # if there are still more than one objects for a location, + # we choose the one with minimal area + volumes = torch.where(inside_gt_bbox_mask, volumes, + torch.ones_like(volumes) * float_max) + volumes = torch.where(inside_best_scale_mask, volumes, + torch.ones_like(volumes) * float_max) + volumes = torch.where(inside_top_centerness_mask, volumes, + torch.ones_like(volumes) * float_max) + min_area, min_area_inds = volumes.min(dim=1) + + labels = gt_labels[min_area_inds] + labels = torch.where(min_area == float_max, + torch.ones_like(labels) * -1, labels) + bbox_targets = bbox_targets[range(n_points), min_area_inds] + centerness_targets = self._get_centerness(bbox_targets) + + return centerness_targets, self._bbox_pred_to_bbox( + points, bbox_targets), labels + + def _nms(self, bboxes, scores, img_meta): + scores, labels = scores.max(dim=1) + ids = scores > self.test_cfg.score_thr + bboxes = bboxes[ids] + scores = scores[ids] + labels = labels[ids] + ids = self.aligned_3d_nms(bboxes, scores, labels, + self.test_cfg.iou_thr) + bboxes = bboxes[ids] + bboxes = torch.stack( + ((bboxes[:, 0] + bboxes[:, 3]) / 2., + (bboxes[:, 1] + bboxes[:, 4]) / 2., + (bboxes[:, 2] + bboxes[:, 5]) / 2., bboxes[:, 3] - bboxes[:, 0], + bboxes[:, 4] - bboxes[:, 1], bboxes[:, 5] - bboxes[:, 2]), + dim=1) + return bboxes, scores[ids], labels[ids] + + @staticmethod + def aligned_3d_nms(boxes, scores, classes, thresh): + """3d nms for aligned boxes. + + Args: + boxes (torch.Tensor): Aligned box with shape [n, 6]. + scores (torch.Tensor): Scores of each box. + classes (torch.Tensor): Class of each box. + thresh (float): Iou threshold for nms. + + Returns: + torch.Tensor: Indices of selected boxes. 
+ """ + x1 = boxes[:, 0] + y1 = boxes[:, 1] + z1 = boxes[:, 2] + x2 = boxes[:, 3] + y2 = boxes[:, 4] + z2 = boxes[:, 5] + area = (x2 - x1) * (y2 - y1) * (z2 - z1) + zero = boxes.new_zeros(1, ) + + score_sorted = torch.argsort(scores) + pick = [] + while (score_sorted.shape[0] != 0): + last = score_sorted.shape[0] + i = score_sorted[-1] + pick.append(i) + + xx1 = torch.max(x1[i], x1[score_sorted[:last - 1]]) + yy1 = torch.max(y1[i], y1[score_sorted[:last - 1]]) + zz1 = torch.max(z1[i], z1[score_sorted[:last - 1]]) + xx2 = torch.min(x2[i], x2[score_sorted[:last - 1]]) + yy2 = torch.min(y2[i], y2[score_sorted[:last - 1]]) + zz2 = torch.min(z2[i], z2[score_sorted[:last - 1]]) + classes1 = classes[i] + classes2 = classes[score_sorted[:last - 1]] + inter_l = torch.max(zero, xx2 - xx1) + inter_w = torch.max(zero, yy2 - yy1) + inter_h = torch.max(zero, zz2 - zz1) + + inter = inter_l * inter_w * inter_h + iou = inter / (area[i] + area[score_sorted[:last - 1]] - inter) + iou = iou * (classes1 == classes2).float() + score_sorted = score_sorted[torch.nonzero( + iou <= thresh, as_tuple=False).flatten()] + + indices = boxes.new_tensor(pick, dtype=torch.long) + return indices diff --git a/projects/NeRF-Det/nerfdet/scannet_multiview_dataset.py b/projects/NeRF-Det/nerfdet/scannet_multiview_dataset.py new file mode 100644 index 0000000000..a20bc3eec0 --- /dev/null +++ b/projects/NeRF-Det/nerfdet/scannet_multiview_dataset.py @@ -0,0 +1,202 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings +from os import path as osp +from typing import Callable, List, Optional, Union + +import numpy as np + +from mmdet3d.datasets import Det3DDataset +from mmdet3d.registry import DATASETS +from mmdet3d.structures import DepthInstance3DBoxes + + +@DATASETS.register_module() +class MultiViewScanNetDataset(Det3DDataset): + r"""Multi-View ScanNet Dataset for NeRF-detection Task + + This class serves as the API for experiments on the ScanNet Dataset. + + Please refer to the `github repo `_ + for data downloading. + + Args: + data_root (str): Path of dataset root. + ann_file (str): Path of annotation file. + metainfo (dict, optional): Meta information for dataset, such as class + information. Defaults to None. + pipeline (List[dict]): Pipeline used for data processing. + Defaults to []. + modality (dict): Modality to specify the sensor data used as input. + Defaults to dict(use_camera=True, use_lidar=False). + box_type_3d (str): Type of 3D box of this dataset. + Based on the `box_type_3d`, the dataset will encapsulate the box + to its original format then converted them to `box_type_3d`. + Defaults to 'Depth' in this dataset. Available options includes: + + - 'LiDAR': Box in LiDAR coordinates. + - 'Depth': Box in depth coordinates, usually for indoor dataset. + - 'Camera': Box in camera coordinates. + filter_empty_gt (bool): Whether to filter the data with empty GT. + If it's set to be True, the example with empty annotations after + data pipeline will be dropped and a random example will be chosen + in `__getitem__`. Defaults to True. + test_mode (bool): Whether the dataset is in test mode. + Defaults to False. 
+ """ + METAINFO = { + 'classes': + ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window', + 'bookshelf', 'picture', 'counter', 'desk', 'curtain', 'refrigerator', + 'showercurtrain', 'toilet', 'sink', 'bathtub', 'garbagebin') + } + + def __init__(self, + data_root: str, + ann_file: str, + metainfo: Optional[dict] = None, + pipeline: List[Union[dict, Callable]] = [], + modality: dict = dict(use_camera=True, use_lidar=False), + box_type_3d: str = 'Depth', + filter_empty_gt: bool = True, + remove_dontcare: bool = False, + test_mode: bool = False, + **kwargs) -> None: + + self.remove_dontcare = remove_dontcare + + super().__init__( + data_root=data_root, + ann_file=ann_file, + metainfo=metainfo, + pipeline=pipeline, + modality=modality, + box_type_3d=box_type_3d, + filter_empty_gt=filter_empty_gt, + test_mode=test_mode, + **kwargs) + + assert 'use_camera' in self.modality and \ + 'use_lidar' in self.modality + assert self.modality['use_camera'] or self.modality['use_lidar'] + + @staticmethod + def _get_axis_align_matrix(info: dict) -> np.ndarray: + """Get axis_align_matrix from info. If not exist, return identity mat. + + Args: + info (dict): Info of a single sample data. + + Returns: + np.ndarray: 4x4 transformation matrix. + """ + if 'axis_align_matrix' in info: + return np.array(info['axis_align_matrix']) + else: + warnings.warn( + 'axis_align_matrix is not found in ScanNet data info, please ' + 'use new pre-process scripts to re-generate ScanNet data') + return np.eye(4).astype(np.float32) + + def parse_data_info(self, info: dict) -> dict: + """Process the raw data info. + + Convert all relative path of needed modality data file to + the absolute path. + + Args: + info (dict): Raw info dict. + + Returns: + dict: Has `ann_info` in training stage. And + all path has been converted to absolute path. + """ + if self.modality['use_depth']: + info['depth_info'] = [] + if self.modality['use_neuralrecon_depth']: + info['depth_info'] = [] + + if self.modality['use_lidar']: + # implement lidar processing in the future + raise NotImplementedError( + 'Please modified ' + '`MultiViewPipeline` to support lidar processing') + + info['axis_align_matrix'] = self._get_axis_align_matrix(info) + info['img_info'] = [] + info['lidar2img'] = [] + info['c2w'] = [] + info['camrotc2w'] = [] + info['lightpos'] = [] + # load img and depth_img + for i in range(len(info['img_paths'])): + img_filename = osp.join(self.data_root, info['img_paths'][i]) + + info['img_info'].append(dict(filename=img_filename)) + if 'depth_info' in info.keys(): + if self.modality['use_neuralrecon_depth']: + info['depth_info'].append( + dict(filename=img_filename[:-4] + '.npy')) + else: + info['depth_info'].append( + dict(filename=img_filename[:-4] + '.png')) + # implement lidar_info in input.keys() in the future. 
+ extrinsic = np.linalg.inv( + info['axis_align_matrix'] @ info['lidar2cam'][i]) + info['lidar2img'].append(extrinsic.astype(np.float32)) + if self.modality['use_ray']: + c2w = ( + info['axis_align_matrix'] @ info['lidar2cam'][i]).astype( + np.float32) # noqa + info['c2w'].append(c2w) + info['camrotc2w'].append(c2w[0:3, 0:3]) + info['lightpos'].append(c2w[0:3, 3]) + origin = np.array([.0, .0, .5]) + info['lidar2img'] = dict( + extrinsic=info['lidar2img'], + intrinsic=info['cam2img'].astype(np.float32), + origin=origin.astype(np.float32)) + + if self.modality['use_ray']: + info['ray_info'] = [] + + if not self.test_mode: + info['ann_info'] = self.parse_ann_info(info) + if self.test_mode and self.load_eval_anns: + info['ann_info'] = self.parse_ann_info(info) + info['eval_ann_info'] = self._remove_dontcare(info['ann_info']) + + return info + + def parse_ann_info(self, info: dict) -> dict: + """Process the `instances` in data info to `ann_info`. + + Args: + info (dict): Info dict. + + Returns: + dict: Processed `ann_info`. + """ + ann_info = super().parse_ann_info(info) + + if self.remove_dontcare: + ann_info = self._remove_dontcare(ann_info) + + # empty gt + if ann_info is None: + ann_info = dict() + ann_info['gt_bboxes_3d'] = np.zeros((0, 6), dtype=np.float32) + ann_info['gt_labels_3d'] = np.zeros((0, ), dtype=np.int64) + + ann_info['gt_bboxes_3d'] = DepthInstance3DBoxes( + ann_info['gt_bboxes_3d'], + box_dim=ann_info['gt_bboxes_3d'].shape[-1], + with_yaw=False, + origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d) + + # count the numbers + for label in ann_info['gt_labels_3d']: + if label != -1: + cat_name = self.metainfo['classes'][label] + self.num_ins_per_cat[cat_name] += 1 + + return ann_info diff --git a/projects/NeRF-Det/prepare_infos.py b/projects/NeRF-Det/prepare_infos.py new file mode 100644 index 0000000000..3e1a13516f --- /dev/null +++ b/projects/NeRF-Det/prepare_infos.py @@ -0,0 +1,151 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""Prepare the dataset for NeRF-Det. + +Example: + python projects/NeRF-Det/prepare_infos.py + --root-path ./data/scannet + --out-dir ./data/scannet +""" +import argparse +import time +from os import path as osp +from pathlib import Path + +import mmengine + +from ...tools.dataset_converters import indoor_converter as indoor +from ...tools.dataset_converters.update_infos_to_v2 import ( + clear_data_info_unused_keys, clear_instance_unused_keys, + get_empty_instance, get_empty_standard_data_info) + + +def update_scannet_infos_nerfdet(pkl_path, out_dir): + """Update the origin pkl to the new format which will be used in nerf-det. + + Args: + pkl_path (str): Path of the origin pkl. + out_dir (str): Output directory of the generated info file. + + Returns: + The pkl will be overwritTen. + The new pkl is a dict containing two keys: + metainfo: Some base information of the pkl + data_list (list): A list containing all the information of the scenes. 
+ """ + print('The new refactored process is running.') + print(f'{pkl_path} will be modified.') + if out_dir in pkl_path: + print(f'Warning, you may overwriting ' + f'the original data {pkl_path}.') + time.sleep(5) + METAINFO = { + 'classes': + ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window', + 'bookshelf', 'picture', 'counter', 'desk', 'curtain', 'refrigerator', + 'showercurtrain', 'toilet', 'sink', 'bathtub', 'garbagebin') + } + print(f'Reading from input file: {pkl_path}.') + data_list = mmengine.load(pkl_path) + print('Start updating:') + converted_list = [] + for ori_info_dict in mmengine.track_iter_progress(data_list): + temp_data_info = get_empty_standard_data_info() + + # intrinsics, extrinsics and imgs + temp_data_info['cam2img'] = ori_info_dict['intrinsics'] + temp_data_info['lidar2cam'] = ori_info_dict['extrinsics'] + temp_data_info['img_paths'] = ori_info_dict['img_paths'] + + # annotation information + anns = ori_info_dict.get('annos', None) + ignore_class_name = set() + if anns is not None: + temp_data_info['axis_align_matrix'] = anns[ + 'axis_align_matrix'].tolist() + if anns['gt_num'] == 0: + instance_list = [] + else: + num_instances = len(anns['name']) + instance_list = [] + for instance_id in range(num_instances): + empty_instance = get_empty_instance() + empty_instance['bbox_3d'] = anns['gt_boxes_upright_depth'][ + instance_id].tolist() + + if anns['name'][instance_id] in METAINFO['classes']: + empty_instance['bbox_label_3d'] = METAINFO[ + 'classes'].index(anns['name'][instance_id]) + else: + ignore_class_name.add(anns['name'][instance_id]) + empty_instance['bbox_label_3d'] = -1 + + empty_instance = clear_instance_unused_keys(empty_instance) + instance_list.append(empty_instance) + temp_data_info['instances'] = instance_list + temp_data_info, _ = clear_data_info_unused_keys(temp_data_info) + converted_list.append(temp_data_info) + pkl_name = Path(pkl_path).name + out_path = osp.join(out_dir, pkl_name) + print(f'Writing to output file: {out_path}.') + print(f'ignore classes: {ignore_class_name}') + + # dataset metainfo + metainfo = dict() + metainfo['categories'] = {k: i for i, k in enumerate(METAINFO['classes'])} + if ignore_class_name: + for ignore_class in ignore_class_name: + metainfo['categories'][ignore_class] = -1 + metainfo['dataset'] = 'scannet' + metainfo['info_version'] = '1.1' + + converted_data_info = dict(metainfo=metainfo, data_list=converted_list) + + mmengine.dump(converted_data_info, out_path, 'pkl') + + +def scannet_data_prep(root_path, info_prefix, out_dir, workers): + """Prepare the info file for scannet dataset. + + Args: + root_path (str): Path of dataset root. + info_prefix (str): The prefix of info filenames. + out_dir (str): Output directory of the generated info file. + workers (int): Number of threads to be used. + version (str): Only used to generate the dataset of nerfdet now. 
+ """ + indoor.create_indoor_info_file( + root_path, info_prefix, out_dir, workers=workers) + info_train_path = osp.join(out_dir, f'{info_prefix}_infos_train.pkl') + info_val_path = osp.join(out_dir, f'{info_prefix}_infos_val.pkl') + info_test_path = osp.join(out_dir, f'{info_prefix}_infos_test.pkl') + update_scannet_infos_nerfdet(out_dir=out_dir, pkl_path=info_train_path) + update_scannet_infos_nerfdet(out_dir=out_dir, pkl_path=info_val_path) + update_scannet_infos_nerfdet(out_dir=out_dir, pkl_path=info_test_path) + + +parser = argparse.ArgumentParser(description='Data converter arg parser') +parser.add_argument( + '--root-path', + type=str, + default='./data/scannet', + help='specify the root path of dataset') +parser.add_argument( + '--out-dir', + type=str, + default='./data/scannet', + required=False, + help='name of info pkl') +parser.add_argument('--extra-tag', type=str, default='scannet') +parser.add_argument( + '--workers', type=int, default=4, help='number of threads to be used') +args = parser.parse_args() + +if __name__ == '__main__': + from mmdet3d.utils import register_all_modules + register_all_modules() + + scannet_data_prep( + root_path=args.root_path, + info_prefix=args.extra_tag, + out_dir=args.out_dir, + workers=args.workers) From 88b86943b550957ffbc838dc63d455c5ead2a1d3 Mon Sep 17 00:00:00 2001 From: Sun Jiahao <72679458+sunjiahao1999@users.noreply.github.com> Date: Thu, 4 Jan 2024 21:19:29 +0800 Subject: [PATCH 6/7] [Feature] Support PGD and multi-view FCOS3D++ on Waymo (#2835) Co-authored-by: JingweiZhang12 Co-authored-by: sjh --- .../datasets/waymoD3-fov-mono3d-3class.py | 184 ++++++++++++++++ .../datasets/waymoD3-mv-mono3d-3class.py | 191 +++++++++++++++++ .../_base_/datasets/waymoD5-mv3d-3class.py | 32 ++- configs/_base_/models/multiview_dfm.py | 28 +-- configs/mvfcos3d/README.md | 62 ++++++ ...fcos3d_r101-dcn_8xb2_waymoD5-3d-3class.py} | 0 ...dcn_centerhead_16xb2_waymoD5-3d-3class.py} | 0 configs/pgd/README.md | 17 ++ ...1_fpn-head_dcn_16xb3_waymoD5-fov-mono3d.py | 6 +- ..._gn-head_dcn_8xb3-2x_waymoD3-fov-mono3d.py | 112 ++++++++++ ...n_gn-head_dcn_8xb3-2x_waymoD3-mv-mono3d.py | 111 ++++++++++ docs/en/user_guides/dataset_prepare.md | 44 ++-- docs/zh_cn/user_guides/dataset_prepare.md | 44 ++-- mmdet3d/models/dense_heads/pgd_head.py | 2 - mmdet3d/models/detectors/dfm.py | 7 +- mmdet3d/models/detectors/multiview_dfm.py | 196 ++++++++++++++---- 16 files changed, 909 insertions(+), 127 deletions(-) create mode 100644 configs/_base_/datasets/waymoD3-fov-mono3d-3class.py create mode 100644 configs/_base_/datasets/waymoD3-mv-mono3d-3class.py create mode 100644 configs/mvfcos3d/README.md rename configs/{dfm/multiview-dfm_r101-dcn_16xb2_waymoD5-3d-3class.py => mvfcos3d/multiview-fcos3d_r101-dcn_8xb2_waymoD5-3d-3class.py} (100%) rename configs/{dfm/multiview-dfm_r101-dcn_centerhead_16xb2_waymoD5-3d-3class.py => mvfcos3d/multiview-fcos3d_r101-dcn_centerhead_16xb2_waymoD5-3d-3class.py} (100%) create mode 100644 configs/pgd/pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-fov-mono3d.py create mode 100644 configs/pgd/pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-mv-mono3d.py diff --git a/configs/_base_/datasets/waymoD3-fov-mono3d-3class.py b/configs/_base_/datasets/waymoD3-fov-mono3d-3class.py new file mode 100644 index 0000000000..b0142b9c94 --- /dev/null +++ b/configs/_base_/datasets/waymoD3-fov-mono3d-3class.py @@ -0,0 +1,184 @@ +# dataset settings +# D3 in the config name means the whole dataset is divided into 3 folds +# We only use one fold for efficient experiments 
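+# 'fov' in the config name denotes the front-of-view monocular setting:
+# annotations are loaded with load_type='fov_image_based' and evaluation
+# uses the FOV ground-truth file fov_gt.bin (see val_evaluator below).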
+dataset_type = 'WaymoDataset' +data_root = 'data/waymo/kitti_format/' +class_names = ['Pedestrian', 'Cyclist', 'Car'] +metainfo = dict(classes=class_names) +input_modality = dict(use_lidar=False, use_camera=True) + +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +# data_root = 's3://openmmlab/datasets/detection3d/waymo/kitti_format/' + +# Method 2: Use backend_args, file_client_args in versions before 1.1.0 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection3d/', +# 'data/': 's3://openmmlab/datasets/detection3d/' +# })) +backend_args = None + +train_pipeline = [ + dict(type='LoadImageFromFileMono3D', backend_args=backend_args), + dict( + type='LoadAnnotations3D', + with_bbox=True, + with_label=True, + with_attr_label=False, + with_bbox_3d=True, + with_label_3d=True, + with_bbox_depth=True), + # base shape (1248, 832), scale (0.95, 1.05) + dict( + type='RandomResize3D', + scale=(1248, 832), + ratio_range=(0.95, 1.05), + # ratio_range=(1., 1.), + interpolation='nearest', + keep_ratio=True, + ), + dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), + dict( + type='Pack3DDetInputs', + keys=[ + 'img', 'gt_bboxes', 'gt_bboxes_labels', 'gt_bboxes_3d', + 'gt_labels_3d', 'centers_2d', 'depths' + ]), +] + +test_pipeline = [ + dict(type='LoadImageFromFileMono3D', backend_args=backend_args), + dict( + type='RandomResize3D', + scale=(1248, 832), + ratio_range=(1., 1.), + interpolation='nearest', + keep_ratio=True), + dict( + type='Pack3DDetInputs', + keys=['img'], + meta_keys=[ + 'box_type_3d', 'img_shape', 'cam2img', 'scale_factor', + 'sample_idx', 'context_name', 'timestamp', 'lidar2cam' + ]), +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. client) +eval_pipeline = [ + dict(type='LoadImageFromFileMono3D', backend_args=backend_args), + dict( + type='RandomResize3D', + scale=(1248, 832), + ratio_range=(1., 1.), + interpolation='nearest', + keep_ratio=True), + dict( + type='Pack3DDetInputs', + keys=['img'], + meta_keys=[ + 'box_type_3d', 'img_shape', 'cam2img', 'scale_factor', + 'sample_idx', 'context_name', 'timestamp', 'lidar2cam' + ]), +] + +train_dataloader = dict( + batch_size=3, + num_workers=3, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='waymo_infos_train.pkl', + data_prefix=dict( + pts='training/velodyne', + CAM_FRONT='training/image_0', + CAM_FRONT_LEFT='training/image_1', + CAM_FRONT_RIGHT='training/image_2', + CAM_SIDE_LEFT='training/image_3', + CAM_SIDE_RIGHT='training/image_4'), + pipeline=train_pipeline, + modality=input_modality, + test_mode=False, + metainfo=metainfo, + cam_sync_instances=True, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
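+        # the camera-only Waymo settings keep boxes in camera coordinates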
+ box_type_3d='Camera', + load_type='fov_image_based', + # load one frame every three frames + load_interval=3, + backend_args=backend_args)) + +val_dataloader = dict( + batch_size=1, + num_workers=1, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + pts='training/velodyne', + CAM_FRONT='training/image_0', + CAM_FRONT_LEFT='training/image_1', + CAM_FRONT_RIGHT='training/image_2', + CAM_SIDE_LEFT='training/image_3', + CAM_SIDE_RIGHT='training/image_4'), + ann_file='waymo_infos_val.pkl', + pipeline=eval_pipeline, + modality=input_modality, + test_mode=True, + metainfo=metainfo, + cam_sync_instances=True, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='Camera', + load_type='fov_image_based', + load_eval_anns=False, + backend_args=backend_args)) + +test_dataloader = dict( + batch_size=1, + num_workers=1, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + pts='training/velodyne', + CAM_FRONT='training/image_0', + CAM_FRONT_LEFT='training/image_1', + CAM_FRONT_RIGHT='training/image_2', + CAM_SIDE_LEFT='training/image_3', + CAM_SIDE_RIGHT='training/image_4'), + ann_file='waymo_infos_val.pkl', + pipeline=eval_pipeline, + modality=input_modality, + test_mode=True, + metainfo=metainfo, + cam_sync_instances=True, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='Camera', + load_type='fov_image_based', + backend_args=backend_args)) + +val_evaluator = dict( + type='WaymoMetric', + waymo_bin_file='./data/waymo/waymo_format/fov_gt.bin', + metric='LET_mAP', + load_type='fov_image_based', + result_prefix='./pgd_fov_pred') +test_evaluator = val_evaluator + +vis_backends = [dict(type='LocalVisBackend')] +visualizer = dict( + type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer') diff --git a/configs/_base_/datasets/waymoD3-mv-mono3d-3class.py b/configs/_base_/datasets/waymoD3-mv-mono3d-3class.py new file mode 100644 index 0000000000..af245effa7 --- /dev/null +++ b/configs/_base_/datasets/waymoD3-mv-mono3d-3class.py @@ -0,0 +1,191 @@ +# dataset settings +# D3 in the config name means the whole dataset is divided into 3 folds +# We only use one fold for efficient experiments +dataset_type = 'WaymoDataset' +data_root = 'data/waymo/kitti_format/' +class_names = ['Pedestrian', 'Cyclist', 'Car'] +metainfo = dict(classes=class_names) +input_modality = dict(use_lidar=False, use_camera=True) + +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +# data_root = 's3://openmmlab/datasets/detection3d/waymo/kitti_format/' + +# Method 2: Use backend_args, file_client_args in versions before 1.1.0 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection3d/', +# 'data/': 's3://openmmlab/datasets/detection3d/' +# })) +backend_args = None + +train_pipeline = [ + dict(type='LoadImageFromFileMono3D', backend_args=backend_args), + dict( + type='LoadAnnotations3D', + with_bbox=True, + with_label=True, + with_attr_label=False, + with_bbox_3d=True, + with_label_3d=True, + with_bbox_depth=True), + # 
base shape (1248, 832), scale (0.95, 1.05) + dict( + type='RandomResize3D', + scale=(1248, 832), + # ratio_range=(1., 1.), + ratio_range=(0.95, 1.05), + interpolation='nearest', + keep_ratio=True, + ), + dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), + dict( + type='Pack3DDetInputs', + keys=[ + 'img', 'gt_bboxes', 'gt_bboxes_labels', 'gt_bboxes_3d', + 'gt_labels_3d', 'centers_2d', 'depths' + ]), +] + +test_pipeline = [ + dict(type='LoadImageFromFileMono3D', backend_args=backend_args), + dict( + type='Resize3D', + scale_factor=0.65, + interpolation='nearest', + keep_ratio=True), + dict( + type='Pack3DDetInputs', + keys=['img'], + meta_keys=[ + 'box_type_3d', 'img_shape', 'cam2img', 'scale_factor', + 'sample_idx', 'context_name', 'timestamp', 'lidar2cam' + ]), +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. client) +eval_pipeline = [ + dict(type='LoadImageFromFileMono3D', backend_args=backend_args), + dict( + type='Resize3D', + scale_factor=0.65, + interpolation='nearest', + keep_ratio=True), + dict( + type='Pack3DDetInputs', + keys=['img'], + meta_keys=[ + 'box_type_3d', 'img_shape', 'cam2img', 'scale_factor', + 'sample_idx', 'context_name', 'timestamp', 'lidar2cam' + ]), +] + +train_dataloader = dict( + batch_size=3, + num_workers=3, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='waymo_infos_train.pkl', + data_prefix=dict( + pts='training/velodyne', + CAM_FRONT='training/image_0', + CAM_FRONT_LEFT='training/image_1', + CAM_FRONT_RIGHT='training/image_2', + CAM_SIDE_LEFT='training/image_3', + CAM_SIDE_RIGHT='training/image_4'), + pipeline=train_pipeline, + modality=input_modality, + test_mode=False, + metainfo=metainfo, + cam_sync_instances=True, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='Camera', + load_type='mv_image_based', + # load one frame every three frames + load_interval=3, + backend_args=backend_args)) + +val_dataloader = dict( + batch_size=1, + num_workers=0, + persistent_workers=False, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + pts='training/velodyne', + CAM_FRONT='training/image_0', + CAM_FRONT_LEFT='training/image_1', + CAM_FRONT_RIGHT='training/image_2', + CAM_SIDE_LEFT='training/image_3', + CAM_SIDE_RIGHT='training/image_4'), + ann_file='waymo_infos_val.pkl', + pipeline=eval_pipeline, + modality=input_modality, + test_mode=True, + metainfo=metainfo, + cam_sync_instances=True, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
+ box_type_3d='Camera', + load_type='mv_image_based', + # load_eval_anns=False, + backend_args=backend_args)) + +test_dataloader = dict( + batch_size=1, + num_workers=0, + persistent_workers=False, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + pts='training/velodyne', + CAM_FRONT='training/image_0', + CAM_FRONT_LEFT='training/image_1', + CAM_FRONT_RIGHT='training/image_2', + CAM_SIDE_LEFT='training/image_3', + CAM_SIDE_RIGHT='training/image_4'), + ann_file='waymo_infos_val.pkl', + pipeline=eval_pipeline, + modality=input_modality, + test_mode=True, + metainfo=metainfo, + cam_sync_instances=True, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='Camera', + load_type='mv_image_based', + load_eval_anns=False, + backend_args=backend_args)) + +val_evaluator = dict( + type='WaymoMetric', + waymo_bin_file='./data/waymo/waymo_format/cam_gt.bin', + metric='LET_mAP', + load_type='mv_image_based', + result_prefix='./pgd_mv_pred', + nms_cfg=dict( + use_rotate_nms=True, + nms_across_levels=False, + nms_pre=500, + nms_thr=0.05, + score_thr=0.001, + min_bbox_size=0, + max_per_frame=100)) +test_evaluator = val_evaluator + +vis_backends = [dict(type='LocalVisBackend')] +visualizer = dict( + type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer') diff --git a/configs/_base_/datasets/waymoD5-mv3d-3class.py b/configs/_base_/datasets/waymoD5-mv3d-3class.py index a9cd619da4..6ea2678993 100644 --- a/configs/_base_/datasets/waymoD5-mv3d-3class.py +++ b/configs/_base_/datasets/waymoD5-mv3d-3class.py @@ -1,5 +1,5 @@ # dataset settings -# D3 in the config name means the whole dataset is divided into 3 folds +# D5 in the config name means the whole dataset is divided into 5 folds # We only use one fold for efficient experiments dataset_type = 'WaymoDataset' data_root = 'data/waymo/kitti_format/' @@ -19,7 +19,7 @@ # })) backend_args = None -class_names = ['Car', 'Pedestrian', 'Cyclist'] +class_names = ['Pedestrian', 'Cyclist', 'Car'] input_modality = dict(use_lidar=False, use_camera=True) point_cloud_range = [-35.0, -75.0, -2, 75.0, 75.0, 4] @@ -30,7 +30,7 @@ scale=(1248, 832), ratio_range=(0.95, 1.05), keep_ratio=True), - dict(type='RandomCrop3D', crop_size=(720, 1080)), + dict(type='RandomCrop3D', crop_size=(1080, 720)), dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5, flip_box3d=False), ] @@ -70,7 +70,14 @@ to_float32=True, backend_args=backend_args), dict(type='MultiViewWrapper', transforms=test_transforms), - dict(type='Pack3DDetInputs', keys=['img']) + dict( + type='Pack3DDetInputs', + keys=['img'], + meta_keys=[ + 'box_type_3d', 'img_shape', 'ori_cam2img', 'scale_factor', + 'sample_idx', 'context_name', 'timestamp', 'lidar2cam', + 'num_ref_frames', 'num_views' + ]) ] # construct a pipeline for data and gt loading in show function # please keep its loading function consistent with test_pipeline (e.g. 
client) @@ -80,7 +87,14 @@ to_float32=True, backend_args=backend_args), dict(type='MultiViewWrapper', transforms=test_transforms), - dict(type='Pack3DDetInputs', keys=['img']) + dict( + type='Pack3DDetInputs', + keys=['img'], + meta_keys=[ + 'box_type_3d', 'img_shape', 'ori_cam2img', 'scale_factor', + 'sample_idx', 'context_name', 'timestamp', 'lidar2cam', + 'num_ref_frames', 'num_views' + ]) ] metainfo = dict(classes=class_names) @@ -103,6 +117,7 @@ pipeline=train_pipeline, modality=input_modality, test_mode=False, + cam_sync_instances=True, metainfo=metainfo, box_type_3d='Lidar', load_interval=5, @@ -149,7 +164,7 @@ CAM_FRONT_RIGHT='training/image_2', CAM_SIDE_LEFT='training/image_3', CAM_SIDE_RIGHT='training/image_4'), - pipeline=eval_pipeline, + pipeline=test_pipeline, modality=input_modality, test_mode=True, metainfo=metainfo, @@ -157,10 +172,7 @@ backend_args=backend_args)) val_evaluator = dict( type='WaymoMetric', - ann_file='./data/waymo/kitti_format/waymo_infos_val.pkl', waymo_bin_file='./data/waymo/waymo_format/cam_gt.bin', - data_root='./data/waymo/waymo_format', - metric='LET_mAP', - backend_args=backend_args) + metric='LET_mAP') test_evaluator = val_evaluator diff --git a/configs/_base_/models/multiview_dfm.py b/configs/_base_/models/multiview_dfm.py index e6f4d276c7..7fa5376d1e 100644 --- a/configs/_base_/models/multiview_dfm.py +++ b/configs/_base_/models/multiview_dfm.py @@ -35,7 +35,7 @@ type='AlignedAnchor3DRangeGenerator', ranges=[[-35.0, -75.0, -2, 75.0, 75.0, 4]], rotations=[.0]), - bbox_head=dict( + bbox_head_3d=dict( type='Anchor3DHead', num_classes=3, in_channels=256, @@ -43,13 +43,13 @@ use_direction_classifier=True, anchor_generator=dict( type='AlignedAnchor3DRangeGenerator', - ranges=[[-35.0, -75.0, -0.0345, 75.0, 75.0, -0.0345], - [-35.0, -75.0, 0, 75.0, 75.0, 0], - [-35.0, -75.0, -0.1188, 75.0, 75.0, -0.1188]], + ranges=[[-35.0, -75.0, 0, 75.0, 75.0, 0], + [-35.0, -75.0, -0.1188, 75.0, 75.0, -0.1188], + [-35.0, -75.0, -0.0345, 75.0, 75.0, -0.0345]], sizes=[ - [4.73, 2.08, 1.77], # car [0.91, 0.84, 1.74], # pedestrian [1.81, 0.84, 1.77], # cyclist + [4.73, 2.08, 1.77], # car ], rotations=[0, 1.57], reshape_out=False), @@ -69,13 +69,6 @@ loss_weight=0.2)), train_cfg=dict( assigner=[ - dict( # for Car - type='Max3DIoUAssigner', - iou_calculator=dict(type='BboxOverlapsNearest3D'), - pos_iou_thr=0.6, - neg_iou_thr=0.45, - min_pos_iou=0.45, - ignore_iof_thr=-1), dict( # for Pedestrian type='Max3DIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), @@ -90,6 +83,13 @@ neg_iou_thr=0.35, min_pos_iou=0.35, ignore_iof_thr=-1), + dict( # for Car + type='Max3DIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.6, + neg_iou_thr=0.45, + min_pos_iou=0.45, + ignore_iof_thr=-1) ], allowed_border=0, pos_weight=-1, @@ -100,5 +100,5 @@ nms_thr=0.05, score_thr=0.001, min_bbox_size=0, - nms_pre=500, - max_num=100)) + nms_pre=4096, + max_num=500)) diff --git a/configs/mvfcos3d/README.md b/configs/mvfcos3d/README.md new file mode 100644 index 0000000000..934ec1ad6c --- /dev/null +++ b/configs/mvfcos3d/README.md @@ -0,0 +1,62 @@ +# MV-FCOS3D++: Multi-View Camera-Only 4D Object Detection with Pretrained Monocular Backbones + +> [MV-FCOS3D++: Multi-View} Camera-Only 4D Object Detection with Pretrained Monocular Backbones](https://arxiv.org/abs/2207.12716) + + + +## Abstract + +In this technical report, we present our solution, dubbed MV-FCOS3D++, for the Camera-Only 3D Detection track in Waymo Open Dataset Challenge 2022. 
For multi-view camera-only 3D detection, methods based on bird-eye-view or 3D geometric representations can leverage the stereo cues from overlapped regions between adjacent views and directly perform 3D detection without hand-crafted post-processing. However, it lacks direct semantic supervision for 2D backbones, which can be complemented by pretraining simple monocular-based detectors. Our solution is a multi-view framework for 4D detection following this paradigm. It is built upon a simple monocular detector FCOS3D++, pretrained only with object annotations of Waymo, and converts multi-view features to a 3D grid space to detect 3D objects thereon. A dual-path neck for single-frame understanding and temporal stereo matching is devised to incorporate multi-frame information. Our method finally achieves 49.75% mAPL with a single model and wins 2nd place in the WOD challenge, without any LiDAR-based depth supervision during training. The code will be released at [this https URL](https://github.com/Tai-Wang/Depth-from-Motion). + +
+ +
+
+## Introduction
+
+We implement multi-view FCOS3D++ and provide the results on the Waymo dataset.
+
+## Usage
+
+### Training commands
+
+1. You should train PGD first:
+
+```bash
+bash tools/dist_train.sh configs/pgd/pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-mv-mono3d.py 8
+```
+
+2. Given the pre-trained PGD backbone, you can train multi-view FCOS3D++:
+
+```bash
+bash tools/dist_train.sh configs/mvfcos3d/multiview-fcos3d_r101-dcn_8xb2_waymoD5-3d-3class.py 8 --cfg-options load_from=${PRETRAINED_CHECKPOINT}
+```
+
+**Note**:
+the path of `load_from` needs to be changed to your own checkpoint path.
+
+## Results and models
+
+### Waymo
+
+| Backbone | Load Interval | mAPL | mAP | mAPH | Download |
+| :---------------------------------------------------------------------: | :-----------: | :--: | :--: | :--: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| [ResNet101+DCN](./multiview-fcos3d_r101-dcn_8xb2_waymoD5-3d-3class.py) | 5x | 38.2 | 52.9 | 49.5 | [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/mvfcos3d/multiview-fcos3d_r101-dcn_8xb2_waymoD5-3d-3class/multiview-fcos3d_r101-dcn_8xb2_waymoD5-3d-3class_20231127_122815.log) |
+| above @ Car | | 56.5 | 73.3 | 72.3 | |
+| above @ Pedestrian | | 34.8 | 49.5 | 43.1 | |
+| above @ Cyclist | | 23.2 | 35.9 | 33.3 | |
+
+**Note**:
+
+Regrettably, we are unable to provide the pre-trained model weights due to the [Waymo Dataset License Agreement](https://waymo.com/open/terms/), so we only provide the training logs as shown above.
+
+## Citation
+
+```latex
+@article{wang2022mvfcos3d++,
+  title={{MV-FCOS3D++: Multi-View} Camera-Only 4D Object Detection with Pretrained Monocular Backbones},
+  author={Wang, Tai and Lian, Qing and Zhu, Chenming and Zhu, Xinge and Zhang, Wenwei},
+  journal={arXiv preprint},
+  year={2022}
+}
+```
diff --git a/configs/dfm/multiview-dfm_r101-dcn_16xb2_waymoD5-3d-3class.py b/configs/mvfcos3d/multiview-fcos3d_r101-dcn_8xb2_waymoD5-3d-3class.py
similarity index 100%
rename from configs/dfm/multiview-dfm_r101-dcn_16xb2_waymoD5-3d-3class.py
rename to configs/mvfcos3d/multiview-fcos3d_r101-dcn_8xb2_waymoD5-3d-3class.py
diff --git a/configs/dfm/multiview-dfm_r101-dcn_centerhead_16xb2_waymoD5-3d-3class.py b/configs/mvfcos3d/multiview-fcos3d_r101-dcn_centerhead_16xb2_waymoD5-3d-3class.py
similarity index 100%
rename from configs/dfm/multiview-dfm_r101-dcn_centerhead_16xb2_waymoD5-3d-3class.py
rename to configs/mvfcos3d/multiview-fcos3d_r101-dcn_centerhead_16xb2_waymoD5-3d-3class.py
diff --git a/configs/pgd/README.md b/configs/pgd/README.md
index 2237a0f4ef..6c41522b3d 100644
--- a/configs/pgd/README.md
+++ b/configs/pgd/README.md
@@ -50,6 +50,23 @@ Note: mAP represents Car moderate 3D strict AP11 / AP40 results.
Because of the | [above w/ finetune](./pgd_r101-caffe_fpn_head-gn_16xb2-2x_nus-mono3d_finetune.py) | 2x | 9.20 | 35.8 | 42.5 | [model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pgd/pgd_r101_caffe_fpn_gn-head_2x16_2x_nus-mono3d_finetune/pgd_r101_caffe_fpn_gn-head_2x16_2x_nus-mono3d_finetune_20211114_162135-5ec7c1cd.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pgd/pgd_r101_caffe_fpn_gn-head_2x16_2x_nus-mono3d_finetune/pgd_r101_caffe_fpn_gn-head_2x16_2x_nus-mono3d_finetune_20211114_162135.log.json) | | above w/ tta | 2x | 9.20 | 36.8 | 43.1 | | +### Waymo + +| Backbone | Load Interval | Camera view | mAPL | mAP | mAPH | Download | +| :--------------------------------------------------------------------------: | :-----------: | :-----------: | :--: | :--: | :---: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| [ResNet101 w/ DCN](./pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-fov-mono3d.py) | 3x | front-of-view | 15.8 | 22.7 | 21.51 | [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/pgd/pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-fov-mono3d/pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-fov-mono3d_20231107_164117.log) | +| above @ Car | | | 36.7 | 51.6 | 51.0 | | +| above @ Pedestrian | | | 9.0 | 14.1 | 11.4 | | +| above @ Cyclist | | | 1.6 | 2.5 | 2.2 | | +| [ResNet101 w/ DCN](./pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-mv-mono3d.py) | 3x | multi-view | 20.8 | 29.3 | 27.7 | [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/pgd/pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-mv-mono3d/pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-mv-mono3d_20231120_202732.log) | +| above @ Car | | | 41.2 | 56.1 | 55.2 | | +| above @ Pedestrian | | | 20.0 | 29.6 | 25.8 | | +| above @ Cyclist | | | 1.4 | 2.2 | 2.0 | | + +**Note**: + +Regrettably, we are unable to provide the pre-trained model weights due to [Waymo Dataset License Agreement](https://waymo.com/open/terms/), so we only provide the training logs as shown above. 
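+
+As a rough, non-official sketch (the GPU count and `--work-dir` paths below are placeholders, not part of the released configs), both Waymo settings above can be trained through the standard `tools/dist_train.sh` entry point:
+
+```bash
+# Hedged examples: adjust the GPU count and --work-dir to your own environment.
+bash tools/dist_train.sh configs/pgd/pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-fov-mono3d.py 8 --work-dir work_dirs/pgd_waymo_fov
+bash tools/dist_train.sh configs/pgd/pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-mv-mono3d.py 8 --work-dir work_dirs/pgd_waymo_mv
+```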
+ ## Citation ```latex diff --git a/configs/pgd/pgd_r101_fpn-head_dcn_16xb3_waymoD5-fov-mono3d.py b/configs/pgd/pgd_r101_fpn-head_dcn_16xb3_waymoD5-fov-mono3d.py index 856fd8f604..fa50e0e04f 100644 --- a/configs/pgd/pgd_r101_fpn-head_dcn_16xb3_waymoD5-fov-mono3d.py +++ b/configs/pgd/pgd_r101_fpn-head_dcn_16xb3_waymoD5-fov-mono3d.py @@ -68,9 +68,9 @@ type='PGDBBoxCoder', base_depths=((41.01, 18.44), ), base_dims=( - (4.73, 1.77, 2.08), - (0.91, 1.74, 0.84), - (1.81, 1.77, 0.84), + (4.73, 1.77, 2.08), # Car + (0.91, 1.74, 0.84), # Pedestrian + (1.81, 1.77, 0.84), # Cyclist ), code_size=7)), # set weight 1.0 for base 7 dims (offset, depth, size, rot) diff --git a/configs/pgd/pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-fov-mono3d.py b/configs/pgd/pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-fov-mono3d.py new file mode 100644 index 0000000000..f4c6193749 --- /dev/null +++ b/configs/pgd/pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-fov-mono3d.py @@ -0,0 +1,112 @@ +_base_ = [ + '../_base_/datasets/waymoD3-fov-mono3d-3class.py', + '../_base_/models/pgd.py', '../_base_/schedules/mmdet-schedule-1x.py', + '../_base_/default_runtime.py' +] +# load_from = '../Depth-from-Motion/checkpoints/pgd_init.pth' +# model settings +model = dict( + backbone=dict( + type='mmdet.ResNet', + depth=101, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'), + dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, False, True, True)), + neck=dict(num_outs=3), + bbox_head=dict( + num_classes=3, + bbox_code_size=7, + pred_attrs=False, + pred_velo=False, + pred_bbox2d=True, + use_onlyreg_proj=True, + strides=(8, 16, 32), + regress_ranges=((-1, 128), (128, 256), (256, 1e8)), + group_reg_dims=(2, 1, 3, 1, 16, + 4), # offset, depth, size, rot, kpts, bbox2d + reg_branch=( + (256, ), # offset + (256, ), # depth + (256, ), # size + (256, ), # rot + (256, ), # kpts + (256, ) # bbox2d + ), + centerness_branch=(256, ), + loss_cls=dict( + type='mmdet.FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict( + type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), + loss_dir=dict( + type='mmdet.CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_centerness=dict( + type='mmdet.CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + use_depth_classifier=True, + depth_branch=(256, ), + depth_range=(0, 50), + depth_unit=10, + division='uniform', + depth_bins=6, + pred_keypoints=True, + weight_dim=1, + loss_depth=dict( + type='UncertainSmoothL1Loss', alpha=1.0, beta=3.0, + loss_weight=1.0), + loss_bbox2d=dict( + type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=0.0), + loss_consistency=dict(type='mmdet.GIoULoss', loss_weight=0.0), + bbox_coder=dict( + type='PGDBBoxCoder', + base_depths=((41.01, 18.44), ), + base_dims=( + (0.91, 1.74, 0.84), # Pedestrian + (1.81, 1.77, 0.84), # Cyclist + (4.73, 1.77, 2.08)), # Car + code_size=7)), + # set weight 1.0 for base 7 dims (offset, depth, size, rot) + # 0.2 for 16-dim keypoint offsets and 1.0 for 4-dim 2D distance targets + train_cfg=dict(code_weight=[ + 1.0, 1.0, 0.2, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, + 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 1.0, 1.0, 1.0, 1.0 + ]), + test_cfg=dict(nms_pre=100, nms_thr=0.05, score_thr=0.001, max_per_img=20)) + +# optimizer +optim_wrapper = dict( + optimizer=dict( + type='SGD', + 
lr=0.008, + ), + paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.), + clip_grad=dict(max_norm=35, norm_type=2)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0 / 3, + by_epoch=False, + begin=0, + end=500), + dict( + type='MultiStepLR', + begin=0, + end=24, + by_epoch=True, + milestones=[16, 22], + gamma=0.1) +] + +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=24, val_interval=24) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') +auto_scale_lr = dict(enable=False, base_batch_size=48) diff --git a/configs/pgd/pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-mv-mono3d.py b/configs/pgd/pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-mv-mono3d.py new file mode 100644 index 0000000000..2247aa44d8 --- /dev/null +++ b/configs/pgd/pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-mv-mono3d.py @@ -0,0 +1,111 @@ +_base_ = [ + '../_base_/datasets/waymoD3-mv-mono3d-3class.py', + '../_base_/models/pgd.py', '../_base_/schedules/mmdet-schedule-1x.py', + '../_base_/default_runtime.py' +] +# model settings +model = dict( + backbone=dict( + type='mmdet.ResNet', + depth=101, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'), + dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, False, True, True)), + neck=dict(num_outs=3), + bbox_head=dict( + num_classes=3, + bbox_code_size=7, + pred_attrs=False, + pred_velo=False, + pred_bbox2d=True, + use_onlyreg_proj=True, + strides=(8, 16, 32), + regress_ranges=((-1, 128), (128, 256), (256, 1e8)), + group_reg_dims=(2, 1, 3, 1, 16, + 4), # offset, depth, size, rot, kpts, bbox2d + reg_branch=( + (256, ), # offset + (256, ), # depth + (256, ), # size + (256, ), # rot + (256, ), # kpts + (256, ) # bbox2d + ), + centerness_branch=(256, ), + loss_cls=dict( + type='mmdet.FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict( + type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), + loss_dir=dict( + type='mmdet.CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_centerness=dict( + type='mmdet.CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + use_depth_classifier=True, + depth_branch=(256, ), + depth_range=(0, 50), + depth_unit=10, + division='uniform', + depth_bins=6, + pred_keypoints=True, + weight_dim=1, + loss_depth=dict( + type='UncertainSmoothL1Loss', alpha=1.0, beta=3.0, + loss_weight=1.0), + loss_bbox2d=dict( + type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=0.0), + loss_consistency=dict(type='mmdet.GIoULoss', loss_weight=0.0), + bbox_coder=dict( + type='PGDBBoxCoder', + base_depths=((41.01, 18.44), ), + base_dims=( + (0.91, 1.74, 0.84), # Pedestrian + (1.81, 1.77, 0.84), # Cyclist + (4.73, 1.77, 2.08)), # Car + code_size=7)), + # set weight 1.0 for base 7 dims (offset, depth, size, rot) + # 0.2 for 16-dim keypoint offsets and 1.0 for 4-dim 2D distance targets + train_cfg=dict(code_weight=[ + 1.0, 1.0, 0.2, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, + 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 1.0, 1.0, 1.0, 1.0 + ]), + test_cfg=dict(nms_pre=100, nms_thr=0.05, score_thr=0.001, max_per_img=20)) + +# optimizer +optim_wrapper = dict( + optimizer=dict( + type='SGD', + lr=0.008, + ), + paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.), + clip_grad=dict(max_norm=35, norm_type=2)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0 / 
3, + by_epoch=False, + begin=0, + end=500), + dict( + type='MultiStepLR', + begin=0, + end=24, + by_epoch=True, + milestones=[16, 22], + gamma=0.1) +] + +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=24, val_interval=24) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') +auto_scale_lr = dict(enable=False, base_batch_size=48) diff --git a/docs/en/user_guides/dataset_prepare.md b/docs/en/user_guides/dataset_prepare.md index d1b7cad14e..17f368a9d0 100644 --- a/docs/en/user_guides/dataset_prepare.md +++ b/docs/en/user_guides/dataset_prepare.md @@ -133,10 +133,12 @@ sh tools/create_data.sh kitti ### Waymo -Download Waymo open dataset V1.4.1 [HERE](https://waymo.com/open/download/) and its data split [HERE](https://drive.google.com/drive/folders/18BVuF_RYJF0NjZpt8SnfzANiakoRMf0o?usp=sharing). Then put `.tfrecord` files into corresponding folders in `data/waymo/waymo_format/` and put the data split `.txt` files into `data/waymo/kitti_format/ImageSets`. Download ground truth `.bin` file for validation set [HERE](https://console.cloud.google.com/storage/browser/waymo_open_dataset_v_1_4_1/validation/ground_truth_objects) and put it into `data/waymo/waymo_format/`. A tip is that you can use `gsutil` to download the large-scale dataset with commands. You can take this [tool](https://github.com/RalphMao/Waymo-Dataset-Tool) as an example for more details. Subsequently, prepare waymo data by running: +Download Waymo open dataset V1.4.1 [HERE](https://waymo.com/open/download/) and its data split [HERE](https://drive.google.com/drive/folders/18BVuF_RYJF0NjZpt8SnfzANiakoRMf0o?usp=sharing). Then put `.tfrecord` files into corresponding folders in `data/waymo/waymo_format/` and put the data split `.txt` files into `data/waymo/kitti_format/ImageSets`. Download ground truth `.bin` file for validation set [HERE](https://console.cloud.google.com/storage/browser/waymo_open_dataset_v_1_2_0/validation/ground_truth_objects) and put it into `data/waymo/waymo_format/`. A tip is that you can use `gsutil` to download the large-scale dataset with commands. You can take this [tool](https://github.com/RalphMao/Waymo-Dataset-Tool) as an example for more details. Subsequently, prepare waymo data by running: ```bash -python tools/create_data.py waymo --root-path ./data/waymo/ --out-dir ./data/waymo/ --workers 128 --extra-tag waymo +# TF_CPP_MIN_LOG_LEVEL=3 will disable all logging output from TensorFlow. +# The number of `--workers` depends on the maximum number of cores in your CPU. +TF_CPP_MIN_LOG_LEVEL=3 python tools/create_data.py waymo --root-path ./data/waymo --out-dir ./data/waymo --workers 128 --extra-tag waymo --version v1.4 ``` Note that: @@ -149,28 +151,12 @@ Note that: - **Ready-made Annotations**. We have provided the annotation files generated offline [here](#summary-of-annotation-files). However, the original Waymo data still needs to be converted to `kitti-format` data by yourself. -- **Waymo-mini**. If you just want to use a part of Waymo Dataset to verify some methods or debug quickly, you could use our provided [Waymo-mini](https://download.openmmlab.com/mmdetection3d/data/waymo/waymo_mini_kitti_format.tar.gz) which only contains two segments in train split and one segment in val split from the original dataset. All the images, point clouds and annotations in this compressed file have been processed offline so that you can directly download and unzip it to `data/waymo/`: +- **Waymo-mini**. 
If you just want to use a part of Waymo Dataset to verify some methods or debug quickly, you could use our provided [Waymo-mini](https://download.openmmlab.com/mmdetection3d/data/waymo_mmdet3d_after_1x4/waymo_mini.tar.gz) which only contains two segments in train split and one segment in val split from the original dataset. All the images, point clouds and annotations in this compressed file have been processed offline so that you can directly download and unzip it to `data/waymo/`: ```bash - tar -xzvf waymo_mini_kitti_format.tar.gz -C ./data/waymo + tar -xzvf waymo_mini.tar.gz -C ./data/waymo_mini ``` -- **Faster evaluation**. If you want faster evaluation on Waymo, you can download the preprocessed [metainfo](https://download.openmmlab.com/mmdetection3d/data/waymo/idx2metainfo.pkl) containing `contextname` and `timestamp` to the directory `data/waymo/waymo_format/` and then modify the dataset config as the following: - - ```python - val_evaluator = dict( - type='WaymoMetric', - ann_file='./data/waymo/kitti_format/waymo_infos_val.pkl', - waymo_bin_file='./data/waymo/waymo_format/gt.bin', - data_root='./data/waymo/waymo_format', - backend_args=backend_args, - convert_kitti_format=True, - idx2metainfo='data/waymo/waymo_format/idx2metainfo.pkl' - ) - ``` - - Now, this trick is only used for LiDAR-based detection methods. - ### NuScenes 1. Download nuScenes V1.0 full dataset data [HERE](https://www.nuscenes.org/download). Alternatively, you @@ -272,12 +258,12 @@ python tools/dataset_converters/update_infos_to_v2.py --dataset kitti --pkl-path We provide ready-made annotation files we generated offline for reference. You can directly use these files for convenice. -| Dataset | Train annotation file | Val annotation file | Test information file | -| :--------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------: | -| KITTI | [kitti_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/kitti/kitti_infos_train.pkl) | [kitti_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/kitti/kitti_infos_val.pkl) | [kitti_infos_test](https://download.openmmlab.com/mmdetection3d/data/kitti/kitti_infos_test.pkl) | -| NuScenes | [nuscenes_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/nuscenes/nuscenes_infos_train.pkl) [nuscenes_mini_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/nuscenes/nuscenes_mini_infos_train.pkl) | [nuscenes_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/nuscenes/nuscenes_infos_val.pkl) [nuscenes_mini_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/nuscenes/nuscenes_mini_infos_val.pkl) | | -| Waymo | [waymo_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/waymo/waymo_infos_train.pkl) [waymo_mini_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/waymo/waymo_mini_infos_train.pkl) | 
[waymo_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/waymo/waymo_infos_val.pkl) [waymo_mini_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/waymo/waymo_mini_infos_val.pkl) | [waymo_infos_test.pkl](https://download.openmmlab.com/mmdetection3d/data/waymo/waymo_infos_test.pkl) | -| [Waymo-mini kitti-format data](https://download.openmmlab.com/mmdetection3d/data/waymo/waymo_mini_kitti_format.tar.gz) | | | | -| SUN RGB-D | [sunrgbd_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/sunrgbd/sunrgbd_infos_train.pkl) | [sunrgbd_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/sunrgbd/sunrgbd_infos_val.pkl) | | -| ScanNet | [scannet_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/scannet/scannet_infos_train.pkl) | [scannet_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/scannet/scannet_infos_val.pkl) | [scannet_infos_test.pkl](https://download.openmmlab.com/mmdetection3d/data/scannet/scannet_infos_test.pkl) | -| SemanticKitti | [semantickitti_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/semantickitti/semantickitti_infos_train.pkl) | [semantickitti_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/semantickitti/semantickitti_infos_val.pkl) | [semantickitti_infos_test.pkl](https://download.openmmlab.com/mmdetection3d/data/semantickitti/semantickitti_infos_test.pkl) | +| Dataset | Train annotation file | Val annotation file | Test information file | +| :-------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| KITTI | [kitti_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/kitti/kitti_infos_train.pkl) | [kitti_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/kitti/kitti_infos_val.pkl) | [kitti_infos_test](https://download.openmmlab.com/mmdetection3d/data/kitti/kitti_infos_test.pkl) | +| NuScenes | [nuscenes_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/nuscenes/nuscenes_infos_train.pkl) [nuscenes_mini_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/nuscenes/nuscenes_mini_infos_train.pkl) | [nuscenes_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/nuscenes/nuscenes_infos_val.pkl) [nuscenes_mini_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/nuscenes/nuscenes_mini_infos_val.pkl) | | +| Waymo | [waymo_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/waymo_mmdet3d_after_1x4/waymo_infos_train.pkl) | [waymo_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/waymo_mmdet3d_after_1x4/waymo_infos_val.pkl) | [waymo_infos_test.pkl](https://download.openmmlab.com/mmdetection3d/data/waymo/waymo_infos_test.pkl) 
[waymo_infos_test_cam_only.pkl](https://download.openmmlab.com/mmdetection3d/data/waymo_mmdet3d_after_1x4/waymo_infos_test_cam_only.pkl) | +| [Waymo-mini](https://download.openmmlab.com/mmdetection3d/data/waymo_mmdet3d_after_1x4/waymo_mini.tar.gz) | | | | +| SUN RGB-D | [sunrgbd_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/sunrgbd/sunrgbd_infos_train.pkl) | [sunrgbd_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/sunrgbd/sunrgbd_infos_val.pkl) | | +| ScanNet | [scannet_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/scannet/scannet_infos_train.pkl) | [scannet_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/scannet/scannet_infos_val.pkl) | [scannet_infos_test.pkl](https://download.openmmlab.com/mmdetection3d/data/scannet/scannet_infos_test.pkl) | +| SemanticKitti | [semantickitti_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/semantickitti/semantickitti_infos_train.pkl) | [semantickitti_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/semantickitti/semantickitti_infos_val.pkl) | [semantickitti_infos_test.pkl](https://download.openmmlab.com/mmdetection3d/data/semantickitti/semantickitti_infos_test.pkl) | diff --git a/docs/zh_cn/user_guides/dataset_prepare.md b/docs/zh_cn/user_guides/dataset_prepare.md index 917015dcff..983e988afb 100644 --- a/docs/zh_cn/user_guides/dataset_prepare.md +++ b/docs/zh_cn/user_guides/dataset_prepare.md @@ -111,10 +111,12 @@ python tools/create_data.py kitti --root-path ./data/kitti --out-dir ./data/kitt ### Waymo -在[这里](https://waymo.com/open/download/)下载 Waymo 公开数据集 1.2 版本,在[这里](https://drive.google.com/drive/folders/18BVuF_RYJF0NjZpt8SnfzANiakoRMf0o?usp=sharing)下载其数据划分文件。然后,将 `.tfrecord` 文件置于 `data/waymo/waymo_format/` 目录下的相应位置,并将数据划分的 `.txt` 文件置于 `data/waymo/kitti_format/ImageSets` 目录下。在[这里](https://console.cloud.google.com/storage/browser/waymo_open_dataset_v_1_2_0/validation/ground_truth_objects)下载验证集的真实标签(`.bin` 文件)并将其置于 `data/waymo/waymo_format/`。提示:你可以使用 `gsutil` 来用命令下载大规模的数据集。更多细节请参考此[工具](https://github.com/RalphMao/Waymo-Dataset-Tool)。完成以上各步后,可以通过运行以下指令对 Waymo 数据进行预处理: +在[这里](https://waymo.com/open/download/)下载 Waymo 公开数据集 1.4.1 版本,在[这里](https://drive.google.com/drive/folders/18BVuF_RYJF0NjZpt8SnfzANiakoRMf0o?usp=sharing)下载其数据划分文件。然后,将 `.tfrecord` 文件置于 `data/waymo/waymo_format/` 目录下的相应位置,并将数据划分的 `.txt` 文件置于 `data/waymo/kitti_format/ImageSets` 目录下。在[这里](https://console.cloud.google.com/storage/browser/waymo_open_dataset_v_1_2_0/validation/ground_truth_objects)下载验证集的真实标签(`.bin` 文件)并将其置于 `data/waymo/waymo_format/`。提示:你可以使用 `gsutil` 来用命令下载大规模的数据集。更多细节请参考此[工具](https://github.com/RalphMao/Waymo-Dataset-Tool)。完成以上各步后,可以通过运行以下指令对 Waymo 数据进行预处理: ```bash -python tools/create_data.py waymo --root-path ./data/waymo/ --out-dir ./data/waymo/ --workers 128 --extra-tag waymo +# TF_CPP_MIN_LOG_LEVEL=3 will disable all logging output from TensorFlow. +# The number of `--workers` depends on the maximum number of cores in your CPU. 
+TF_CPP_MIN_LOG_LEVEL=3 python tools/create_data.py waymo --root-path ./data/waymo --out-dir ./data/waymo --workers 128 --extra-tag waymo --version v1.4 ``` 注意: @@ -125,28 +127,12 @@ python tools/create_data.py waymo --root-path ./data/waymo/ --out-dir ./data/way - **现成的标注文件**: 我们已经提供了离线处理好的 [Waymo 标注文件](#数据集标注文件列表)。您直接下载他们并放到 `data/waymo/kitti_format/` 目录下。然而,您还是需要自己使用上面的脚本将 Waymo 的原始数据还需要转成 kitti 格式。 -- **Waymo-mini**: 如果你只是为了验证某些方法或者 debug, 你可以使用我们提供的 [Waymo-mini](https://download.openmmlab.com/mmdetection3d/data/waymo/waymo_mini_kitti_format.tar.gz)。它只包含原始数据集中训练集中的 2 个 segments 和 验证集中的 1 个 segment。您只需要下载并且解压到 `data/waymo/`,即可使用它: +- **Waymo-mini**: 如果你只是为了验证某些方法或者 debug, 你可以使用我们提供的 [Waymo-mini](https://download.openmmlab.com/mmdetection3d/data/waymo_mmdet3d_after_1x4/waymo_mini.tar.gz)。它只包含原始数据集中训练集中的 2 个 segments 和 验证集中的 1 个 segment。您只需要下载并且解压到 `data/waymo_mini/`,即可使用它: ```bash - tar -xzvf waymo_mini_kitti_format.tar.gz -C ./data/waymo + tar -xzvf waymo_mini.tar.gz -C ./data/waymo_mini ``` -- **更快的评估**: 如果你想在 Waymo 上进行更快的评估,你可以下载已经预处理好的[元信息文件](https://download.openmmlab.com/mmdetection3d/data/waymo/idx2metainfo.pkl)并将其放置在 `data/waymo/waymo_format/` 目录下。接着,你可以按照以下来修改数据集的配置: - - ```python - val_evaluator = dict( - type='WaymoMetric', - ann_file='./data/waymo/kitti_format/waymo_infos_val.pkl', - waymo_bin_file='./data/waymo/waymo_format/gt.bin', - data_root='./data/waymo/waymo_format', - backend_args=backend_args, - convert_kitti_format=True, - idx2metainfo='data/waymo/waymo_format/idx2metainfo.pkl' - ) - ``` - - 目前这种方式仅限于纯点云检测任务。 - ### NuScenes 在[这里](https://www.nuscenes.org/download)下载 nuScenes 数据集 1.0 版本的完整数据文件。通过运行以下指令对 nuScenes 数据进行预处理: @@ -222,12 +208,12 @@ python tools/dataset_converters/update_infos_to_v2.py --dataset kitti --pkl-path 我们提供了离线生成好的数据集标注文件以供参考。为了方便,您也可以直接使用他们。 -| 数据集 | 训练集标注文件 | 验证集标注文件 | 测试集标注文件 | -| :--------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------: | -| KITTI | [kitti_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/kitti/kitti_infos_train.pkl) | [kitti_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/kitti/kitti_infos_val.pkl) | [kitti_infos_test](https://download.openmmlab.com/mmdetection3d/data/kitti/kitti_infos_test.pkl) | -| NuScenes | [nuscenes_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/nuscenes/nuscenes_infos_train.pkl) [nuscenes_mini_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/nuscenes/nuscenes_mini_infos_train.pkl) | [nuscenes_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/nuscenes/nuscenes_infos_val.pkl) [nuscenes_mini_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/nuscenes/nuscenes_mini_infos_val.pkl) | | -| Waymo | [waymo_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/waymo/waymo_infos_train.pkl) 
[waymo_mini_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/waymo/waymo_mini_infos_train.pkl) | [waymo_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/waymo/waymo_infos_val.pkl) [waymo_mini_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/waymo/waymo_mini_infos_val.pkl) | [waymo_infos_test.pkl](https://download.openmmlab.com/mmdetection3d/data/waymo/waymo_infos_test.pkl) | -| [Waymo-mini kitti-format data](https://download.openmmlab.com/mmdetection3d/data/waymo/waymo_mini_kitti_format.tar.gz) | | | | -| SUN RGB-D | [sunrgbd_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/sunrgbd/sunrgbd_infos_train.pkl) | [sunrgbd_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/sunrgbd/sunrgbd_infos_val.pkl) | | -| ScanNet | [scannet_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/scannet/scannet_infos_train.pkl) | [scannet_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/scannet/scannet_infos_val.pkl) | [scannet_infos_test.pkl](https://download.openmmlab.com/mmdetection3d/data/scannet/scannet_infos_test.pkl) | -| SemanticKitti | [semantickitti_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/semantickitti/semantickitti_infos_train.pkl) | [semantickitti_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/semantickitti/semantickitti_infos_val.pkl) | [semantickitti_infos_test.pkl](https://download.openmmlab.com/mmdetection3d/data/semantickitti/semantickitti_infos_test.pkl) | +| 数据集 | 训练集标注文件 | 验证集标注文件 | 测试集标注文件 | +| :-------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| KITTI | [kitti_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/kitti/kitti_infos_train.pkl) | [kitti_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/kitti/kitti_infos_val.pkl) | [kitti_infos_test](https://download.openmmlab.com/mmdetection3d/data/kitti/kitti_infos_test.pkl) | +| NuScenes | [nuscenes_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/nuscenes/nuscenes_infos_train.pkl) [nuscenes_mini_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/nuscenes/nuscenes_mini_infos_train.pkl) | [nuscenes_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/nuscenes/nuscenes_infos_val.pkl) [nuscenes_mini_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/nuscenes/nuscenes_mini_infos_val.pkl) | | +| Waymo | [waymo_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/waymo_mmdet3d_after_1x4/waymo_infos_train.pkl) | [waymo_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/waymo_mmdet3d_after_1x4/waymo_infos_val.pkl) | [waymo_infos_test.pkl](https://download.openmmlab.com/mmdetection3d/data/waymo/waymo_infos_test.pkl) 
[waymo_infos_test_cam_only.pkl](https://download.openmmlab.com/mmdetection3d/data/waymo_mmdet3d_after_1x4/waymo_infos_test_cam_only.pkl) | +| [Waymo-mini](https://download.openmmlab.com/mmdetection3d/data/waymo_mmdet3d_after_1x4/waymo_mini.tar.gz) | | | | +| SUN RGB-D | [sunrgbd_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/sunrgbd/sunrgbd_infos_train.pkl) | [sunrgbd_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/sunrgbd/sunrgbd_infos_val.pkl) | | +| ScanNet | [scannet_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/scannet/scannet_infos_train.pkl) | [scannet_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/scannet/scannet_infos_val.pkl) | [scannet_infos_test.pkl](https://download.openmmlab.com/mmdetection3d/data/scannet/scannet_infos_test.pkl) | +| SemanticKitti | [semantickitti_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/semantickitti/semantickitti_infos_train.pkl) | [semantickitti_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/semantickitti/semantickitti_infos_val.pkl) | [semantickitti_infos_test.pkl](https://download.openmmlab.com/mmdetection3d/data/semantickitti/semantickitti_infos_test.pkl) | diff --git a/mmdet3d/models/dense_heads/pgd_head.py b/mmdet3d/models/dense_heads/pgd_head.py index c33ddb9e24..4a733b8691 100644 --- a/mmdet3d/models/dense_heads/pgd_head.py +++ b/mmdet3d/models/dense_heads/pgd_head.py @@ -1032,8 +1032,6 @@ def _predict_by_feat_single(self, # change the offset to actual center predictions bbox_pred3d[:, :2] = points - bbox_pred3d[:, :2] if rescale: - bbox_pred3d[:, :2] /= bbox_pred3d[:, :2].new_tensor( - scale_factor[0]) if self.pred_bbox2d: bbox_pred2d /= bbox_pred2d.new_tensor(scale_factor[0]) if self.use_depth_classifier: diff --git a/mmdet3d/models/detectors/dfm.py b/mmdet3d/models/detectors/dfm.py index 736e43e826..7494c8546f 100644 --- a/mmdet3d/models/detectors/dfm.py +++ b/mmdet3d/models/detectors/dfm.py @@ -13,6 +13,9 @@ class DfM(BaseDetector): `_. Args: + data_preprocessor (:obj:`ConfigDict` or dict): The pre-process + config of :class:`BaseDataPreprocessor`. it usually includes, + ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``. backbone (:obj:`ConfigDict` or dict): The backbone config. neck (:obj:`ConfigDict` or dict): The neck config. backbone_stereo (:obj:`ConfigDict` or dict): The stereo backbone @@ -39,6 +42,7 @@ class DfM(BaseDetector): """ def __init__(self, + data_preprocessor: ConfigType, backbone: ConfigType, neck: ConfigType, backbone_stereo: ConfigType, @@ -53,7 +57,8 @@ def __init__(self, test_cfg=None, pretrained=None, init_cfg=None): - super().__init__(init_cfg=init_cfg) + super().__init__( + data_preprocessor=data_preprocessor, init_cfg=init_cfg) self.backbone = MODELS.build(backbone) self.neck = MODELS.build(neck) if backbone_stereo is not None: diff --git a/mmdet3d/models/detectors/multiview_dfm.py b/mmdet3d/models/detectors/multiview_dfm.py index fce4c92014..81446d30f2 100644 --- a/mmdet3d/models/detectors/multiview_dfm.py +++ b/mmdet3d/models/detectors/multiview_dfm.py @@ -1,20 +1,22 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Union + import numpy as np import torch -from mmdet.models.detectors import BaseDetector +from mmengine.structures import InstanceData +from torch import Tensor from mmdet3d.models.layers.fusion_layers.point_fusion import (point_sample, voxel_sample) from mmdet3d.registry import MODELS, TASK_UTILS from mmdet3d.structures.bbox_3d.utils import get_lidar2img from mmdet3d.structures.det3d_data_sample import SampleList -from mmdet3d.utils import ConfigType, OptConfigType +from mmdet3d.utils import ConfigType, OptConfigType, OptInstanceList from .dfm import DfM -from .imvoxelnet import ImVoxelNet @MODELS.register_module() -class MultiViewDfM(ImVoxelNet, DfM): +class MultiViewDfM(DfM): r"""Waymo challenge solution of `MV-FCOS3D++ `_. @@ -25,7 +27,7 @@ class MultiViewDfM(ImVoxelNet, DfM): config. backbone_3d (:obj:`ConfigDict` or dict): The 3d backbone config. neck_3d (:obj:`ConfigDict` or dict): The 3D neck config. - bbox_head (:obj:`ConfigDict` or dict): The bbox head config. + bbox_head_3d (:obj:`ConfigDict` or dict): The bbox head config. voxel_size (:obj:`ConfigDict` or dict): The voxel size. anchor_generator (:obj:`ConfigDict` or dict): The anchor generator config. @@ -60,7 +62,7 @@ def __init__(self, backbone_stereo: ConfigType, backbone_3d: ConfigType, neck_3d: ConfigType, - bbox_head: ConfigType, + bbox_head_3d: ConfigType, voxel_size: ConfigType, anchor_generator: ConfigType, neck_2d: ConfigType = None, @@ -71,41 +73,24 @@ def __init__(self, test_cfg: OptConfigType = None, data_preprocessor: OptConfigType = None, valid_sample: bool = True, - temporal_aggregate: str = 'concat', + temporal_aggregate: str = 'mean', transform_depth: bool = True, init_cfg: OptConfigType = None): - # TODO merge with DFM - BaseDetector.__init__( - self, data_preprocessor=data_preprocessor, init_cfg=init_cfg) - - self.backbone = MODELS.build(backbone) - self.neck = MODELS.build(neck) - if backbone_stereo is not None: - backbone_stereo.update(cat_img_feature=self.neck.cat_img_feature) - backbone_stereo.update(in_sem_channels=self.neck.sem_channels[-1]) - self.backbone_stereo = MODELS.build(backbone_stereo) - assert self.neck.cat_img_feature == \ - self.backbone_stereo.cat_img_feature - assert self.neck.sem_channels[ - -1] == self.backbone_stereo.in_sem_channels - if backbone_3d is not None: - self.backbone_3d = MODELS.build(backbone_3d) - if neck_3d is not None: - self.neck_3d = MODELS.build(neck_3d) - if neck_2d is not None: - self.neck_2d = MODELS.build(neck_2d) - if bbox_head_2d is not None: - self.bbox_head_2d = MODELS.build(bbox_head_2d) - if depth_head_2d is not None: - self.depth_head_2d = MODELS.build(depth_head_2d) - if depth_head is not None: - self.depth_head = MODELS.build(depth_head) - self.depth_samples = self.depth_head.depth_samples - self.train_cfg = train_cfg - self.test_cfg = test_cfg - bbox_head.update(train_cfg=train_cfg) - bbox_head.update(test_cfg=test_cfg) - self.bbox_head = MODELS.build(bbox_head) + super().__init__( + data_preprocessor=data_preprocessor, + backbone=backbone, + neck=neck, + backbone_stereo=backbone_stereo, + backbone_3d=backbone_3d, + neck_3d=neck_3d, + bbox_head_3d=bbox_head_3d, + neck_2d=neck_2d, + bbox_head_2d=bbox_head_2d, + depth_head_2d=depth_head_2d, + depth_head=depth_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + init_cfg=init_cfg) self.voxel_size = voxel_size self.voxel_range = anchor_generator['ranges'][0] self.n_voxels = [ @@ -371,6 +356,139 @@ def feature_transformation(self, batch_feats, batch_img_metas, num_views, transform_feats += 
(batch_stereo_feats, ) return transform_feats + def loss(self, batch_inputs: Tensor, + batch_data_samples: SampleList) -> Union[dict, tuple]: + """Calculate losses from a batch of inputs dict and data samples. + + Args: + batch_inputs_dict (dict): The model input dict which include + 'points', 'img' keys. + + - points (list[torch.Tensor]): Point cloud of each sample. + - imgs (torch.Tensor, optional): Image of each sample. + + batch_data_samples (List[:obj:`Det3DDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance_3d`, `gt_panoptic_seg_3d` and `gt_sem_seg_3d`. + + Returns: + dict: A dictionary of loss components. + """ + feats = self.extract_feat(batch_inputs, batch_data_samples) + bev_feat = feats[0] + losses = self.bbox_head_3d.loss([bev_feat], batch_data_samples) + return losses + + def predict(self, batch_inputs: Tensor, + batch_data_samples: SampleList) -> SampleList: + """Predict results from a batch of inputs and data samples with post- + processing. + + Args: + batch_inputs_dict (dict): The model input dict which include + 'points', 'imgs' keys. + + - points (list[torch.Tensor]): Point cloud of each sample. + - imgs (torch.Tensor, optional): Image of each sample. + + batch_data_samples (List[:obj:`Det3DDataSample`]): The Data + samples. It usually includes information such as + `gt_instance_3d`, `gt_panoptic_seg_3d` and `gt_sem_seg_3d`. + + Returns: + list[:obj:`Det3DDataSample`]: Detection results of the + input samples. Each Det3DDataSample usually contain + 'pred_instances_3d'. And the ``pred_instances_3d`` usually + contains following keys. + + - scores_3d (Tensor): Classification scores, has a shape + (num_instance, ) + - labels_3d (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes_3d (Tensor): Contains a tensor with shape + (num_instances, C) where C >=7. + """ + feats = self.extract_feat(batch_inputs, batch_data_samples) + bev_feat = feats[0] + results_list = self.bbox_head_3d.predict([bev_feat], + batch_data_samples) + predictions = self.add_pred_to_datasample(batch_data_samples, + results_list) + return predictions + + def _forward(self, + batch_inputs: Tensor, + batch_data_samples: SampleList = None): + """Network forward process. + + Usually includes backbone, neck and head forward without any post- + processing. + """ + feats = self.extract_feat(batch_inputs, batch_data_samples) + bev_feat = feats[0] + self.bbox_head.forward(bev_feat, batch_data_samples) + + def add_pred_to_datasample( + self, + data_samples: SampleList, + data_instances_3d: OptInstanceList = None, + data_instances_2d: OptInstanceList = None, + ) -> SampleList: + """Convert results list to `Det3DDataSample`. + + Subclasses could override it to be compatible for some multi-modality + 3D detectors. + + Args: + data_samples (list[:obj:`Det3DDataSample`]): The input data. + data_instances_3d (list[:obj:`InstanceData`], optional): 3D + Detection results of each sample. + data_instances_2d (list[:obj:`InstanceData`], optional): 2D + Detection results of each sample. + + Returns: + list[:obj:`Det3DDataSample`]: Detection results of the + input. Each Det3DDataSample usually contains + 'pred_instances_3d'. And the ``pred_instances_3d`` normally + contains following keys. + + - scores_3d (Tensor): Classification scores, has a shape + (num_instance, ) + - labels_3d (Tensor): Labels of 3D bboxes, has a shape + (num_instances, ). + - bboxes_3d (Tensor): Contains a tensor with shape + (num_instances, C) where C >=7. 
+ + When there are image predictions in some models, it should + contain `pred_instances`, and the ``pred_instances`` normally + contains the following keys. + + - scores (Tensor): Classification scores of image, has a shape + (num_instance, ) + - labels (Tensor): Predicted labels of 2D bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Contains a tensor with shape + (num_instances, 4). + """ + + assert (data_instances_2d is not None) or \ + (data_instances_3d is not None),\ + 'please pass at least one type of data_samples' + + if data_instances_2d is None: + data_instances_2d = [ + InstanceData() for _ in range(len(data_instances_3d)) + ] + if data_instances_3d is None: + data_instances_3d = [ + InstanceData() for _ in range(len(data_instances_2d)) + ] + + for i, data_sample in enumerate(data_samples): + data_sample.pred_instances_3d = data_instances_3d[i] + data_sample.pred_instances = data_instances_2d[i] + return data_samples + def aug_test(self, imgs, img_metas, **kwargs): """Test with augmentations. From 0ef13b83fac1eaad626c9989feec6b1a0b2588e7 Mon Sep 17 00:00:00 2001 From: Sun Jiahao <72679458+sunjiahao1999@users.noreply.github.com> Date: Mon, 8 Jan 2024 15:22:00 +0800 Subject: [PATCH 7/7] Bump to v1.4.0 (#2866) --- .circleci/test.yml | 8 +++++--- README.md | 10 ++++++++-- README_zh-CN.md | 12 ++++++++---- docker/serve/Dockerfile | 4 ++-- docs/en/notes/changelog.md | 32 ++++++++++++++++++++++++++++++++ docs/en/notes/faq.md | 4 ++-- docs/zh_cn/notes/faq.md | 4 ++-- mmdet3d/__init__.py | 2 +- mmdet3d/version.py | 2 +- 9 files changed, 61 insertions(+), 17 deletions(-) diff --git a/.circleci/test.yml b/.circleci/test.yml index 0324441874..19a25a8610 100644 --- a/.circleci/test.yml +++ b/.circleci/test.yml @@ -85,7 +85,7 @@ jobs: type: string cuda: type: enum - enum: ["11.1", "11.7"] + enum: ["10.2", "11.7"] cudnn: type: integer default: 8 @@ -173,7 +173,8 @@ workflows: torch: 1.8.1 # Use double quotation mark to explicitly specify its type # as string instead of number - cuda: "11.1" + cuda: "10.2" + cudnn: 7 requires: - hold - build_cuda: @@ -190,7 +191,8 @@ workflows: - build_cuda: name: minimum_version_gpu torch: 1.8.1 - cuda: "11.1" + cuda: "10.2" + cudnn: 7 filters: branches: only: diff --git a/README.md b/README.md index 4c9be76b51..1b02f58824 100644 --- a/README.md +++ b/README.md @@ -104,9 +104,15 @@ Like [MMDetection](https://github.com/open-mmlab/mmdetection) and [MMCV](https:/ ### Highlight -**We have renamed the branch `1.1` to `main` and switched the default branch from `master` to `main`. We encourage users to migrate to the latest version, though it comes with some cost. Please refer to [Migration Guide](docs/en/migration.md) for more details.** +In version 1.4, MMDetection3D refactors the Waymo dataset and accelerates the preprocessing, training/testing setup, and evaluation of the Waymo dataset. We also extend the support for camera-based 3D object detection models, such as monocular and BEV, on Waymo. A detailed description of the Waymo data information is provided [here](https://mmdetection3d.readthedocs.io/en/latest/advanced_guides/datasets/waymo.html). -We have constructed a comprehensive LiDAR semantic segmentation benchmark on SemanticKITTI, including Cylinder3D, MinkUNet and SPVCNN methods. Noteworthy, the improved MinkUNetv2 can achieve 70.3 mIoU on the validation set of SemanticKITTI. We have also supported the training of BEVFusion and an occupancy prediction method, TPVFomrer, in our `projects`.
More new features about 3D perception are on the way. Please stay tuned! +Besides, in version 1.4, MMDetection3D provides [Waymo-mini](https://download.openmmlab.com/mmdetection3d/data/waymo_mmdet3d_after_1x4/waymo_mini.tar.gz) to help community users get started with Waymo and use it for quick iterative development. + +**v1.4.0** was released in 8/1/2024: + +- Support the training of [DSVT](https://arxiv.org/abs/2301.06051) in `projects` +- Support [Nerf-Det](https://arxiv.org/abs/2307.14620) in `projects` +- Refactor Waymo dataset **v1.3.0** was released in 18/10/2023: diff --git a/README_zh-CN.md b/README_zh-CN.md index 7305fc50d9..330373d019 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -104,9 +104,15 @@ MMDetection3D 是一个基于 PyTorch 的目标检测开源工具箱,下一代 ### 亮点 -**我们将 `1.1` 分支重命名为 `main` 并将默认分支从 `master` 切换到 `main`。我们鼓励用户迁移到最新版本,请参考 [迁移指南](docs/en/migration.md)以了解更多细节。** +在1.4版本中,MMDetection3D 重构了 Waymo 数据集, 加速了 Waymo 数据集的预处理、训练/测试启动、验证的速度。并且在 Waymo 上拓展了对 单目/BEV 等基于相机的三维目标检测模型的支持。在[这里](https://mmdetection3d.readthedocs.io/en/latest/advanced_guides/datasets/waymo.html)提供了对 Waymo 数据信息的详细解读。 -我们在 SemanticKITTI 上构建了一个全面的点云语义分割基准,包括 Cylinder3D 、MinkUNet 和 SPVCNN 方法。其中,改进后的 MinkUNetv2 在验证集上可以达到 70.3 mIoU。我们还在 `projects` 中支持了 BEVFusion 的训练和全新的 3D 占有网格预测网络 TPVFormer。更多关于 3D 感知的新功能正在进行中。请继续关注! +此外,在1.4版本中,MMDetection3D 提供了 [Waymo-mini](https://download.openmmlab.com/mmdetection3d/data/waymo_mmdet3d_after_1x4/waymo_mini.tar.gz) 来帮助社区用户上手 Waymo 并用于快速迭代开发。 + +**v1.4.0** 版本已经在 2024.1.8 发布: + +- 在 `projects` 中支持了 [DSVT](https://arxiv.org/abs/2301.06051) 的训练 +- 在 `projects` 中支持了 [Nerf-Det](https://arxiv.org/abs/2307.14620) +- 重构了 Waymo 数据集 **v1.3.0** 版本已经在 2023.10.18 发布: @@ -171,8 +177,6 @@ MMDetection3D 是一个基于 PyTorch 的目标检测开源工具箱,下一代 ## 基准测试和模型库 -## 基准测试和模型库 - 测试结果和模型可以在[模型库](docs/zh_cn/model_zoo.md)中找到。
diff --git a/docker/serve/Dockerfile b/docker/serve/Dockerfile index 743a278402..31b87dd521 100644 --- a/docker/serve/Dockerfile +++ b/docker/serve/Dockerfile @@ -4,8 +4,8 @@ ARG CUDNN="8" FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel ARG MMCV="2.0.0rc4" -ARG MMDET="3.1.0" -ARG MMDET3D="1.3.0" +ARG MMDET="3.3.0" +ARG MMDET3D="1.4.0" ENV PYTHONUNBUFFERED TRUE diff --git a/docs/en/notes/changelog.md b/docs/en/notes/changelog.md index c5da3d4e1f..fd038b8b5f 100644 --- a/docs/en/notes/changelog.md +++ b/docs/en/notes/changelog.md @@ -1,5 +1,37 @@ # Changelog of v1.1 +### v1.4.0 (8/1/2024) + +#### Highlights + +- Refactor Waymo dataset (#2836) +- Support the training of [DSVT](https://arxiv.org/abs/2301.06051) in `projects` (#2738) +- Support [Nerf-Det](https://arxiv.org/abs/2307.14620) in `projects` (#2732) + +#### New Features + +- Support the training of [DSVT](https://arxiv.org/abs/2301.06051) in `projects` (#2738) +- Support [Nerf-Det](https://arxiv.org/abs/2307.14620) in `projects` (#2732) +- Support [MV-FCOS3D++](https://arxiv.org/abs/2207.12716) +- Refactor Waymo dataset (#2836) + +#### Improvements + +- Support [PGD](https://arxiv.org/abs/2107.14160) (front-of-view / multi-view) on Waymo dataset (#2835) +- Release new [Waymo-mini](https://download.openmmlab.com/mmdetection3d/data/waymo_mmdet3d_after_1x4/waymo_mini.tar.gz) for verifying some methods or debugging quickly (#2835) + +#### Bug Fixes + +- Fix some wrong configs of MinkUNet and SPVCNN (#2854) +- Fix incorrect number of arguments in PETR (#2800) +- Delete unused files in `mmdet3d/configs` (#2773) + +#### Contributors + +A total of 5 developers contributed to this release. + +@sunjiahao1999, @WendellZ524, @Yanyirong, @JingweiZhang12, @Tai-Wang + ### v1.3.0 (18/10/2023) #### Highlights diff --git a/docs/en/notes/faq.md b/docs/en/notes/faq.md index becd3b9191..0a28fd0150 100644 --- a/docs/en/notes/faq.md +++ b/docs/en/notes/faq.md @@ -10,8 +10,8 @@ We list some potential troubles encountered by users and developers, along with | MMDetection3D version | MMEngine version | MMCV version | MMDetection version | | --------------------- | :----------------------: | :---------------------: | :----------------------: | - | dev-1.x | mmengine>=0.8.0, \<1.0.0 | mmcv>=2.0.0rc4, \<2.2.0 | mmdet>=3.0.0rc5, \<3.3.0 | - | main | mmengine>=0.8.0, \<1.0.0 | mmcv>=2.0.0rc4, \<2.2.0 | mmdet>=3.0.0rc5, \<3.3.0 | + | main | mmengine>=0.8.0, \<1.0.0 | mmcv>=2.0.0rc4, \<2.2.0 | mmdet>=3.0.0rc5, \<3.4.0 | + | v1.4.0 | mmengine>=0.8.0, \<1.0.0 | mmcv>=2.0.0rc4, \<2.2.0 | mmdet>=3.0.0rc5, \<3.4.0 | | v1.3.0 | mmengine>=0.8.0, \<1.0.0 | mmcv>=2.0.0rc4, \<2.2.0 | mmdet>=3.0.0rc5, \<3.3.0 | | v1.2.0 | mmengine>=0.8.0, \<1.0.0 | mmcv>=2.0.0rc4, \<2.1.0 | mmdet>=3.0.0, \<3.2.0 | | v1.1.1 | mmengine>=0.7.1, \<1.0.0 | mmcv>=2.0.0rc4, \<2.1.0 | mmdet>=3.0.0, \<3.1.0 | diff --git a/docs/zh_cn/notes/faq.md b/docs/zh_cn/notes/faq.md index 68ed122264..3e1199cd78 100644 --- a/docs/zh_cn/notes/faq.md +++ b/docs/zh_cn/notes/faq.md @@ -10,8 +10,8 @@ | MMDetection3D 版本 | MMEngine 版本 | MMCV 版本 | MMDetection 版本 | | ------------------ | :----------------------: | :---------------------: | :----------------------: | - | dev-1.x | mmengine>=0.8.0, \<1.0.0 | mmcv>=2.0.0rc4, \<2.2.0 | mmdet>=3.0.0rc5, \<3.3.0 | - | main | mmengine>=0.8.0, \<1.0.0 | mmcv>=2.0.0rc4, \<2.2.0 | mmdet>=3.0.0rc5, \<3.3.0 | + | main | mmengine>=0.8.0, \<1.0.0 | mmcv>=2.0.0rc4, \<2.2.0 | mmdet>=3.0.0rc5, \<3.4.0 | + | v1.4.0 | mmengine>=0.8.0, \<1.0.0 | mmcv>=2.0.0rc4, \<2.2.0 | 
mmdet>=3.0.0rc5, \<3.4.0 | | v1.3.0 | mmengine>=0.8.0, \<1.0.0 | mmcv>=2.0.0rc4, \<2.2.0 | mmdet>=3.0.0rc5, \<3.3.0 | | v1.2.0 | mmengine>=0.8.0, \<1.0.0 | mmcv>=2.0.0rc4, \<2.1.0 | mmdet>=3.0.0, \<3.2.0 | | v1.1.1 | mmengine>=0.7.1, \<1.0.0 | mmcv>=2.0.0rc4, \<2.1.0 | mmdet>=3.0.0, \<3.1.0 | diff --git a/mmdet3d/__init__.py b/mmdet3d/__init__.py index 5f7c341783..17c1777315 100644 --- a/mmdet3d/__init__.py +++ b/mmdet3d/__init__.py @@ -15,7 +15,7 @@ mmengine_version = digit_version(mmengine.__version__) mmdet_minimum_version = '3.0.0rc5' -mmdet_maximum_version = '3.3.0' +mmdet_maximum_version = '3.4.0' mmdet_version = digit_version(mmdet.__version__) assert (mmcv_version >= digit_version(mmcv_minimum_version) diff --git a/mmdet3d/version.py b/mmdet3d/version.py index f45a3d5a3a..e7163ba86e 100644 --- a/mmdet3d/version.py +++ b/mmdet3d/version.py @@ -1,6 +1,6 @@ # Copyright (c) Open-MMLab. All rights reserved. -__version__ = '1.3.0' +__version__ = '1.4.0' short_version = __version__
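The version guard that these bumps feed into lives in `mmdet3d/__init__.py` and matches the FAQ tables above (`mmdet>=3.0.0rc5, <3.4.0`). The snippet below is a minimal sketch of that check, assuming `mmengine` and `mmdet` are installed; the variable names follow the file shown in the diff, while the assertion message is illustrative rather than the exact upstream wording.

```python
# Minimal sketch of the dependency guard pattern in mmdet3d/__init__.py.
# Assumes mmengine and mmdet are installed; the message text is illustrative.
import mmdet
from mmengine.utils import digit_version

mmdet_minimum_version = '3.0.0rc5'
mmdet_maximum_version = '3.4.0'  # raised from 3.3.0 in this release

# digit_version turns a version string (including rc releases) into a
# comparable tuple, so plain tuple comparison enforces the allowed range.
mmdet_version = digit_version(mmdet.__version__)
assert (mmdet_version >= digit_version(mmdet_minimum_version)
        and mmdet_version < digit_version(mmdet_maximum_version)), \
    f'MMDET=={mmdet.__version__} is used but incompatible. ' \
    f'Please install mmdet>={mmdet_minimum_version}, <{mmdet_maximum_version}.'
```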