
[{"type":"Publication","title":"VectorWorld: Efficient Streaming World Model via Diffusion Flow on Vector Graphs","url":"/publications/#jiang2026vectorworld","summary":"A streaming world model for autonomous-driving scenarios built on vector-graph diffusion flow.","meta":"ICML · 2026","tags":"world-model generative e2e"},{"type":"Publication","title":"RegFormer++: An Efficient Large-Scale 3D LiDAR Point Registration Network with Projection-Aware 2D Transformer","url":"/publications/#liu2026regformerpp","summary":"A large-scale 3D LiDAR point registration network with projection-aware transformer design.","meta":"arXiv · 2026","tags":"lidar-odometry perception"},{"type":"Publication","title":"DifFlow3D: Hierarchical Diffusion Models for Uncertainty-Aware 3D Scene Flow Estimation","url":"/publications/#liu2026difflow3d","summary":"A journal version of diffusion-based uncertainty-aware 3D scene flow estimation.","meta":"T-PAMI · 2026","tags":"scene-flow generative perception"},{"type":"Publication","title":"Unsupervised Learning of 3D Scene Flow With LiDAR Odometry Assistance","url":"/publications/#wang2025unsupervised","summary":"An unsupervised 3D scene flow learning method assisted by LiDAR odometry.","meta":"TITS · 2025","tags":"scene-flow lidar-odometry perception"},{"type":"Publication","title":"Mamba4D: Efficient 4D Point Cloud Video Understanding with Disentangled Spatial-Temporal State Space Models","url":"/publications/#liu2025mamba4d","summary":"A 4D point cloud video understanding framework based on disentangled spatial-temporal state space modeling.","meta":"CVPR · 2025","tags":"perception 3d-4d"},{"type":"Publication","title":"GMF-Drive: Gated Mamba Fusion with Spatial-Aware BEV Representation for End-to-End Autonomous Driving","url":"/publications/#wang2025gmfdrive","summary":"An end-to-end autonomous-driving model with gated Mamba fusion and spatial-aware BEV representation.","meta":"arXiv · 2025","tags":"e2e perception world-model"},{"type":"Publication","title":"D^2GSLAM: 4D Dynamic Gaussian Splatting SLAM","url":"/publications/#zhu2025d2gslam","summary":"A dynamic SLAM framework based on 4D Gaussian splatting.","meta":"arXiv · 2025","tags":"generative perception lidar-odometry"},{"type":"Publication","title":"DifFlow3D: Toward Robust Uncertainty-Aware Scene Flow Estimation with Iterative Diffusion-Based Refinement","url":"/publications/#liu2024difflow3d","summary":"A diffusion-based 3D scene flow method for robust and uncertainty-aware point motion estimation.","meta":"CVPR · 2024","tags":"scene-flow generative perception"},{"type":"Publication","title":"3DSFLabelling: Boosting 3D Scene Flow Estimation by Pseudo Auto-labelling","url":"/publications/#jiang20243dsflabelling","summary":"A pseudo auto-labeling framework that provides high-quality 3D motion flow labels for LiDAR point clouds.","meta":"CVPR · 2024","tags":"scene-flow perception 3d-4d"},{"type":"Publication","title":"NeuroGauss4D-PCI: 4D Neural Fields and Gaussian Deformation Fields for Point Cloud Interpolation","url":"/publications/#jiang2024neurogauss4d","summary":"A 4D point cloud interpolation framework using neural fields and Gaussian deformation fields.","meta":"NeurIPS · 2024","tags":"generative perception 3d-4d"},{"type":"Publication","title":"3-D Scene Flow Estimation on Pseudo-LiDAR: Bridging the Gap on Estimating Point Motion","url":"/publications/#jiang2023scene","summary":"A 3D scene flow estimation study on pseudo-LiDAR point representations.","meta":"TII · 2023","tags":"scene-flow perception"},{"type":"Publication","title":"Pseudo-LiDAR for Visual Odometry","url":"/publications/#miao2023pseudo","summary":"A visual odometry method using pseudo-LiDAR representations.","meta":"TIM · 2023","tags":"lidar-odometry perception"},{"type":"Publication","title":"TransLO: A Window-Based Masked Point Transformer Framework for Large-Scale LiDAR Odometry","url":"/publications/#liu2023translo","summary":"A masked point transformer framework for large-scale LiDAR odometry.","meta":"AAAI · 2023","tags":"lidar-odometry perception"},{"type":"Publication","title":"RegFormer: An Efficient Projection-Aware Transformer Network for Large-Scale Point Cloud Registration","url":"/publications/#liu2023regformer","summary":"A projection-aware transformer network for large-scale point cloud registration.","meta":"ICCV · 2023","tags":"lidar-odometry perception"},{"type":"Publication","title":"SFGAN: Unsupervised Generative Adversarial Learning of 3D Scene Flow from the 3D Scene Self","url":"/publications/#wang2022sfgan","summary":"An unsupervised adversarial learning framework for 3D scene flow.","meta":"AIS · 2022","tags":"scene-flow generative perception"},{"type":"Publication","title":"FFPA-Net: Efficient Feature Fusion with Projection Awareness for 3D Object Detection","url":"/publications/#jiang2022ffpanet","summary":"A projection-aware feature fusion method for 3D object detection.","meta":"arXiv · 2022","tags":"perception"}
,{"type":"Project","title":"4D Auto-Labeling & Pure LiDAR 3D Detection","url":"/projects/auto-labeling-4d-lidar/","summary":"A two-phase journey in autonomous-driving auto-labeling: first a Tesla-AI-Day-inspired vision-only 4D auto-labeling pipeline with Hozon Auto and SJTU IRMV, then a multi-modal 4D auto-labeling and production pure-LiDAR 3D detection system at PhiGent Robotics — optimized at the data, model, and loss levels.","meta":"2022.11–2024","tags":"production research 3d-4d perception deployment"},{"type":"Project","title":"Autonomous Lawn-Mower Robot Perception","url":"/projects/autonomous-lawn-mower-robot/","summary":"The robot has to drive itself off a transport vehicle, reach the lawn, mow, and return — so I built its safety-critical perception stack across four modules: ramp detection for self loading/unloading, 3D grass-obstacle detection (geometry first, then camera–LiDAR fusion), an MCU-deployed 2D BEV safety detector, and a dual-attention LiDAR–vision fusion study.","meta":"2021.09–2023.03","tags":"production research robotics 3d-4d deployment"},{"type":"Project","title":"One-Stage End-to-End Driving — 8V Pure Vision","url":"/projects/e2e-7v-vision/","summary":"A one-stage, pure-vision end-to-end driving POC that lifts 8 surround cameras into a single BEV feature, reads three structured perception heads (3D detection, HD map, occupancy) from it, predicts the next-frame BEV under generative scoring, and tokenizes everything into a Diffusion-Flow planner that emits the ego trajectory and neighbouring-agent states — perception, prediction, and planning optimised jointly.","meta":"2025.12–2026.04","tags":"research e2e world-model perception"},{"type":"Project","title":"End-to-End Driving: 11V + LiDAR Fusion","url":"/projects/e2e-lidar-11v-fusion/","summary":"An end-to-end autonomous-driving system that fuses 11 surround cameras (7 pinhole + 4 fisheye) with LiDAR under a sparse-centric (SparseDrive-style) paradigm. My two core deliverables: a fused BEV-fusion CUDA operator that aligns 11-camera and LiDAR features in a single kernel, and the training of an AI planner that outputs motion and planning in parallel from a shared query decoder.","meta":"2024 · Collaboration","tags":"research e2e perception 3d-4d"},{"type":"Project","title":"Generative Autonomous-Driving Simulation Platform","url":"/projects/generative-ad-simulation-platform/","summary":"Built a Cosmos-Transfer2.5-based generative simulation platform: a 7V surround world model validated on internal data, real-map (Ingolstadt OSM → layout → 7V) scenario generation, a gRPC semantic bridge between WorldSim and the world model, the first 4-step distillation of 7V surround video (rCM + DMD2) for up to ~13.9× speedup, an editable platform for rare interaction data, and an all-in-one OneModel that serves layout generation, Gaussian-Splatting fix, and harmonization from a single denoiser.","meta":"2025.03–Present","tags":"research world-model generative e2e"},{"type":"Project","title":"Master's Graduation — Introduction Film","url":"/projects/graduation-intro-video/","summary":"A short introduction film I made for my 2023 master's graduation — a compact tour of my research focus, the labs and mentors I worked with, and the perception and 3D/4D systems I built along the way.","meta":"2023.06","tags":"profile"},{"type":"Project","title":"Integrated Perception, Planning, and Decision-Making Network","url":"/projects/integrated-perception-planning-decision/","summary":"A unified multi-task framework that fuses multi-modal sensors (RGB, LiDAR, infrared) through attention-based feature fusion — jointly solving geometric–semantic mapping, unsupervised depth and odometry, multi-object detection and tracking, and closed-loop behavior decisions inside one end-to-end trainable network.","meta":"2021.08–2022.10","tags":"research e2e perception"},{"type":"Project","title":"J6M Static & Dynamic Perception","url":"/projects/j6m-static-dynamic-perception/","summary":"End-to-end production perception on a mid-trim (J6E / J6M) platform, organised around three shipped systems: a multi-task static OneModel that drives every static element from one shared BEV feature, a 4D-sparse dynamic model that unifies detection and tracking, and an on-board latency-compression effort that cut inference from ~42.65 ms to ~13.88 ms. My work spans architecture, a unified data pipeline, heterogeneous multi-task training, release engineering, and quantization-aware deployment.","meta":"2025.04–2026.03","tags":"production perception bev deployment quantization"},{"type":"Project","title":"Controllable Surround-View Driving Generation","url":"/projects/phigent-gen-model/","summary":"Built a controllable surround-view driving generator that compresses 3D boxes and maps into spatial conditions, encodes text / reference frames / lanes / camera calibration into condition tokens, and injects them into a UNet diffusion backbone — producing cross-camera-consistent 4V / 7V / 11V images and video for data augmentation and open-loop simulation, evolving from OpenSora 1.0 + SD 3.5 to a MagicDrive-fused in-house model.","meta":"2023.05–2024","tags":"research world-model generative e2e"},{"type":"Project","title":"Road Preview: Surface-Element Segmentation","url":"/projects/road-preview-disparity/","summary":"A road-surface perception project for the road-preview ('magic-carpet') suspension feature: segment safety-critical small road elements — manhole covers and speed bumps — reliably under hard real-world conditions (tiny targets, water and oil stains, textureless surfaces), then compress and quantize the model to INT8 for efficient TDA4 edge inference, reaching an initial mass-production quality bar.","meta":"2023.05–2023.11","tags":"production perception deployment"},{"type":"Project","title":"3D Scene Flow: Auto-Labeling & Production Deployment","url":"/projects/scene-flow-deployment/","summary":"A 3D motion-estimation stack for autonomous driving: an unsupervised auto-labeling system that assigns a 3D scene-flow vector to every LiDAR point and every occupancy cell, validated by lifting the accuracy of existing flow estimators, distilled into an ultra-light production head, and deployed end-to-end through ONNX, TensorRT (Orin) and the Horizon J6E toolchain.","meta":"2023.08–2023.12","tags":"research 3d-4d scene-flow deployment"},{"type":"Project","title":"Vector Traffic Generation & Sensor-Level Closed-Loop Simulation","url":"/projects/traffic-sensor-level-ctrl/","summary":"Built a two-level controllable driving simulator: a structure-aware temporal vector VAE (STAR-AE) that compresses sparse, variable agents and lanes into fixed latents, a conditional latent-diffusion generator (STRIDENet) that produces history-consistent future traffic, and a sensor-level closed-loop WorldSim that fuses Gaussian-Splatting reconstruction, traffic-flow generation, and a mask-guided DiT video editor (built on MagicDrive-V2) into photorealistic surround rollouts.","meta":"2025.05–Present","tags":"research world-model generative e2e"}
,{"type":"Blog","title":"GE-Sim 2.0: a closed-loop video world simulator for manipulation","url":"/blog/2026/07/01/ge-sim-2-closed-loop-simulator/","summary":"A talk on AgiBot Genie's GE-Sim 2.0 — turning a video generator into an environment robots can actually train and test in, via a state expert, a VLM world judge, and pixel-aligned action conditioning.","meta":"Jul 2026","tags":"world-model robotics talk"},{"type":"Blog","title":"Embodied AI, full stack: body, brain, and safety","url":"/blog/2026/06/30/embodied-ai-full-stack/","summary":"A three-part survey of embodied intelligence — robot bodies, simulators, perception and grasping; world models; and the duality of capability and risk across perceive, cognize, plan, act.","meta":"Jun 2026","tags":"embodied-ai talk"},{"type":"Blog","title":"Full-stack autonomous driving: from sensors to control","url":"/blog/2026/06/29/full-stack-autonomous-driving/","summary":"A talk through a complete AD stack — dual-brain domain control, multi-view + LiDAR perception, BEV and occupancy, online mapping, tracking, planning, control, parking, and the data flywheel.","meta":"Jun 2026","tags":"autonomous-driving talk"},{"type":"Blog","title":"One-step generative models: the MeanFlow line, paper by paper","url":"/blog/2026/06/28/one-step-generative-models/","summary":"A four-paper walkthrough of He Kaiming's group compressing diffusion's hundreds of steps down to one — MeanFlow, improved iMF, drift-based modeling, and FID-as-a-loss.","meta":"Jun 2026","tags":"generative talk"},{"type":"Blog","title":"OmniDreams: a real-time generative world model for driving","url":"/blog/2026/06/27/omnidreams-realtime-world-model/","summary":"A talk walkthrough of NVIDIA's OmniDreams — a causal autoregressive DiT world model that closes the loop between policy and simulation, with three-stage training to kill error accumulation.","meta":"Jun 2026","tags":"world-model talk"},{"type":"Blog","title":"从 3DGRUT 到 ArtiFixer3D：一个“先重建、再生成、再蒸馏回 3D”的闭环","url":"/blog/2026/06/26/from-3dgrut-to-artifixer3d/","summary":"Reading ArtiFixer (SIGGRAPH 2026, NVIDIA Toronto AI Lab) — an interactive 3D logic atlas of the reconstruct → generate → distill-back-to-3D loop.","meta":"Jun 2026","tags":"world-model 3d generative reading"},{"type":"Blog","title":"从互联网到物理世界：具身智能的数据革命从哪里开始？","url":"/blog/2026/06/25/embodied-ai-data-revolution/","summary":"一份关于具身智能数据采集的工作笔记——在保真·接触、规模·成本、跨本体三者难以兼得的约束下，遥操作、仿真、UMI、Ego 各自下了什么赌注。","meta":"Jun 2026","tags":"embodied-ai data vla reading"},{"type":"Blog","title":"从提示词到 Agent 自动化的发展路径","url":"/blog/2026/06/24/prompt-to-agent-automation/","summary":"AI Agent 工程的四个阶段——提示词、上下文、harness、loop——不是替代关系而是嵌套关系。瓶颈每往后一层就迁移一次。","meta":"Jun 2026","tags":"llm agent engineering notes"},{"type":"Blog","title":"机器人怎么\"练\"出来：在真实世界里，还是在想象里","url":"/blog/2026/06/23/robot-rl-real-vs-imagination/","summary":"以 RECAP / π*0.6（真实世界做 RL）和 RISE（世界模型内做 RL）为两条主线，讲清楚 VLA 为什么单靠模仿学不好、两种破局路线各自的本质与代价。","meta":"Jun 2026","tags":"vla world-model rl reading"}]
