
CRONOSプロジェクトは、多様な移動体と人間を繋ぐ先進的なコミュニケーションプラットフォームの開発を目指しています。私たちの研究は、V2V/V2Xシステム、大規模言語モデル、VRシミュレーション技術を統合し、次世代のモビリティ環境におけるシームレスで安全な、人間中心のインタラクションを実現することに焦点を当てています。
基盤モデルを使用した異種通信メッセージのための堅牢なプラットフォームの開発
多様なニーズとシナリオに対応する進化するコミュニケーションプラットフォームの作成
人間と移動体AIの間のシームレスなコミュニケーションのためのインタラクションの強化
人間-移動体AI共創型交通システムのためのコミュニケーションプラットフォームの展示
autonomous driving machine learning
machine learning uav
autonomous driving v2x
open source
v2x
open data
% TacitCollab paper, VTC2026-Spring.
% Title braces keep the system name and acronyms from being lowercased by
% sentence-casing styles; percent signs in the abstract are escaped so they
% are not read as LaTeX comment characters when the field is typeset.
@inproceedings{Tao2026,
  title     = {{TacitCollab}: A Plug-and-Play {V2X} Safety Filter via Free-Form {LLM} Dialogue},
  author    = {Tao, Ye and Tsukada, Manabu and Esaki, Hiroshi},
  year      = {2026},
  date      = {2026-06-09},
  urldate   = {2026-06-09},
  booktitle = {2026 IEEE 103rd Vehicular Technology Conference (VTC2026-Spring)},
  address   = {Nice, France},
  abstract  = {Autonomous vehicles increasingly rely on Vehicle-to-Everything (V2X) communication to coordinate in complex traffic scenarios. However, the existing approaches typically exchange predefined message formats such as waypoints or discrete actions, limiting their ability to handle exotic situations that require nuanced negotiation. We propose TacitCollab, a V2X dialogue system that enables vehicles to coordinate collision avoidance through free-form natural language communication. TacitCollab leverages vision-language models to interpret traffic situations and engage in human-like dialogue, allowing vehicles to share observations, negotiate priorities, and reach mutual agreements. The system is designed as a plug-and-play module that can be inserted into existing autonomous driving pipelines, accepting intentions in arbitrary formats and outputting updated intentions in the same format. We evaluate TacitCollab on 238 collision scenarios from the DeepAccident dataset. Results show a Collision Avoidance Ratio of 97.4\%, with 84.2\% of scenarios achieving coordinated outcomes where vehicles successfully negotiate right-of-way. The average bandwidth of 1.17 KB per scenario demonstrates that text-based dialogue is practical for bandwidth-constrained V2X networks.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
% Object-centric, structure-aware feature alignment paper, VTC2026-Spring.
% Author names are stored in the unambiguous "Last, First" form; the abstract
% text is kept exactly as exported, including its paragraph breaks.
@inproceedings{Li2026b,
  title     = {Aligning What Matters: Object-Centric and Structure-Aware Feature Alignment for Heterogeneous Collaborative Perception},
  author    = {Li, Donghui and Javanmardi, Ehsan and Tsukada, Manabu},
  year      = {2026},
  date      = {2026-06-09},
  booktitle = {2026 IEEE 103rd Vehicular Technology Conference (VTC2026-Spring)},
  address   = {Nice, France},
  abstract  = {Collaborative perception enhances autonomous driving by enabling vehicles to jointly reason about the environment. Feature-level sharing is particularly attractive, as it preserves rich spatial information and allows uncertain regions to be jointly refined before object inference. However, in realistic deployments, vehicles often rely on heterogeneous perception backbones and preprocessing pipelines, resulting in substantial domain gaps that make feature fusion unreliable. Recent protocol-domain adapter frameworks mitigate this issue by mapping heterogeneous features into a shared latent space, but their alignment objectives are typically defined over the entire Bird’s Eye View (BEV) grid. Since foreground objects occupy only a small fraction of BEV space, such global reconstruction objectives are dominated by background regions and provide weak supervision for aligning detection-critical features. Moreover, purely global alignment does not explicitly preserve fine-grained spatial structures that are essential for accurate 3D object detection.
In this work, we revisit protocol-domain alignment from the perspective of detection relevance and structural consistency. We propose an object-centric and structure-aware adapter for heterogeneous collaborative perception. Specifically, we introduce BEV foreground masks derived from 3D bounding boxes to emphasize object regions during alignment, and incorporate a Structural Similarity Index Measure (SSIM)–based objective to enforce fine-grained structural consistency across heterogeneous feature domains. Importantly, the proposed method preserves the plug-and-play property of protocol-domain collaboration and requires no modification of the original perception backbones or fusion modules.
Extensive experiments under diverse heterogeneous configurations demonstrate consistent improvements over protocol-domain adapter baselines, achieving up to +0.19 AP@0.7 under challenging modality settings.
},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
% V2XState paper, VTC2026-Spring.
% The framework name is braced in the title so sentence-casing styles keep its
% capitalisation; percent signs in the abstract are escaped so LaTeX does not
% treat the rest of the line as a comment.
@inproceedings{Li2026c,
  title     = {{V2XState}: Intent-aware Spatial Basis Attention for Cooperative End-to-end Driving},
  author    = {Li, Dongyang and Javanmardi, Ehsan and Tsukada, Manabu},
  year      = {2026},
  date      = {2026-06-09},
  urldate   = {2026-06-09},
  booktitle = {2026 IEEE 103rd Vehicular Technology Conference (VTC2026-Spring)},
  address   = {Nice, France},
  abstract  = {Cooperative autonomous driving relies on shared perception, yet most planning models implicitly treat spatial relevance as scene-driven and largely invariant to the ego vehicle's intent. We argue that spatial relevance for planning should be intent-conditioned: which regions matter depends on the specific maneuver being executed. This paper proposes V2XState, a framework that operationalizes this insight by using ego states and commands to modulate spatial attention over cooperative features. By incorporating kinematic-aware inductive biases through a state-gated basis attention mechanism, V2XState yields context-sensitive emphasis that aligns planning with current driving intent. We integrate this mechanism into a lightweight planning stack and observe that the resulting attention maps are both interpretable and intent-consistent. Experiments on cooperative driving benchmarks validate that intent-aware spatial attention leads to more maneuver-consistent planning, achieving a 8.2\%/7.2\% reduction in ADE/FDE relative to state-of-the-art baselines.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
% RealSim-CP dataset paper, VTC2026-Spring.
% The dataset name is braced in the title so styles that sentence-case titles
% do not lowercase it, and the en dash is written as "--" so the title also
% works with toolchains that are not UTF-8 aware.
@inproceedings{Aizono2026,
  title     = {{RealSim-CP}: A High-Fidelity Multimodal Cooperative Perception Dataset Bridging the Simulator--Real World Gap},
  author    = {Aizono, Yuji and Javanmardi, Ehsan and Ayar, Fardin and Javanmardi, Mahdi and Tsukada, Manabu and Esaki, Hiroshi},
  year      = {2026},
  date      = {2026-06-09},
  booktitle = {2026 IEEE 103rd Vehicular Technology Conference (VTC2026-Spring)},
  address   = {Nice, France},
  abstract  = {Cooperative perception, which enables vehicles to share sensory information with other vehicles and roadside infrastructure, is essential for advancing autonomous driving beyond single-vehicle limitations. However, existing cooperative perception datasets suffer from two critical limitations: the lack of representation of Japanese traffic environments with their unique characteristics (e.g., left-hand traffic, distinctive vehicle types, and region-specific infrastructure), and the high cost of real-world data collection that constrains dataset scale and diversity.
To address these challenges, we present RealSim-CP, a novel cooperative perception dataset generated using the Driving Intelligence Validation Platform (DIVP), a physics-based simulation system that employs ray tracing and electromagnetic-wave modeling to produce highly realistic sensor data comparable to real-world quality. Leveraging DIVP’s high-fidelity simulation capabilities, we efficiently generate a large-scale dataset comprising synchronized multimodal data—camera images and LiDAR point clouds—from multiple cooperative agents, including vehicles and roadside units. The dataset covers three urban maps representing Tokyo regions (Aomi, Odaiba, and the Shutoko Expressway) under diverse environmental conditions, including clear daytime, rainy daytime, and clear nighttime scenarios.
All data are provided in the standardized OpenLABEL format with annotations for 12 object classes, totaling 140k images and 30k point clouds. We further evaluate RealSim-CP using CoopDet3D, a state-of-the-art multimodal cooperative 3D object detection framework, demonstrating the effectiveness of the dataset for advanced cooperative perception research. These results indicate that high-fidelity simulation can effectively bridge the gap between simulation and real-world deployment while significantly reducing data collection costs. RealSim-CP provides the first region-specific cooperative perception dataset tailored to Japanese traffic environments.
},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
% Collaborative 3D semantic occupancy benchmark paper, ICRA 2026.
% The ampersand in the booktitle is escaped because a bare one is a LaTeX
% alignment character. The original url field held two links separated by a
% newline, which collapses into one broken link; the arXiv identifier now lives
% in eprint/eprinttype and url keeps the code repository.
@inproceedings{Wu2026,
  title      = {A Synthetic Benchmark for Collaborative {3D} Semantic Occupancy Prediction in {V2X}-Enabled Autonomous Driving},
  author     = {Wu, Hanlin and Lin, Pengfei and Javanmardi, Ehsan and Bao, Naren and Qian, Bo and Si, Hao and Tsukada, Manabu},
  url        = {https://github.com/tlab-wide/Co3SOP},
  eprint     = {2506.17004},
  eprinttype = {arXiv},
  year       = {2026},
  date       = {2026-06-01},
  urldate    = {2026-06-01},
  booktitle  = {IEEE International Conference on Robotics \& Automation (ICRA 2026)},
  address    = {Vienna, Austria},
  abstract   = {3D semantic occupancy prediction is an emerging perception paradigm in autonomous driving, providing a voxel-level representation of both geometric details and semantic categories. However, its effectiveness is inherently constrained in single-vehicle setups by occlusions, restricted sensor range, and narrow viewpoints. To address these limitations, collaborative perception enables the exchange of complementary information, thereby enhancing the completeness and accuracy of predictions. Despite its potential, research on collaborative 3D semantic occupancy prediction is hindered by the lack of dedicated datasets. To bridge this gap, we design a high-resolution semantic voxel sensor in CARLA to produce dense and comprehensive annotations. We further develop a baseline model that performs inter-agent feature fusion via spatial alignment and attention aggregation. In addition, we establish benchmarks with varying prediction ranges designed to systematically assess the impact of spatial extent on collaborative prediction. Experimental results demonstrate the superior performance of our baseline, with increasing gains observed as range expands.},
  keywords   = {},
  pubstate   = {published},
  tppubtype  = {inproceedings}
}
% Smart pole interaction unit (SPIU) paper, CHI 2026; Honourable Mention Award.
% The url previously carried a run of stray backslashes before the apostrophe
% in "Don't" (an export escaping artifact), which corrupted the link. The doi
% field remains the stable identifier for this work.
@inproceedings{Chauhan2026b,
  title     = {Don't Worry, Just Follow Me: Prototyping and In-the-Wild Evaluation of Smart Pole Interaction Unit with Mobility},
  author    = {Chauhan, Vishal and Anubhav, Anubhav and Colley, Mark and Chang, Chia-Ming and Gui, Xinyue and Xia, Ding and Javanmardi, Ehsan and Igarashi, Takeo and Fujiwara, Kantaro and Tsukada, Manabu},
  url       = {https://www.researchgate.net/profile/Vishal-Chauhan-17/publication/401082338_Don't_Worry_Just_Follow_Me_Prototyping_and_In-the-Wild_Evaluation_of_Smart_Pole_Interaction_Unit_with_Mobility/links/699c56575d60ab483570b3d5/Dont-Worry-Just-Follow-Me-Prototyping-and-In-the-Wild-Evaluation-of-Smart-Pole-Interaction-Unit-with-Mobility.pdf},
  doi       = {10.1145/3772318.3790882},
  year      = {2026},
  date      = {2026-04-13},
  urldate   = {2026-04-13},
  booktitle = {ACM CHI Conference on Human Factors in Computing Systems 2026},
  address   = {Barcelona, Spain},
  abstract  = {Pedestrian–automated vehicle (AV) encounters in shared spaces often involve hesitation and ambiguity. Vehicle-mounted external human–machine interfaces (eHMIs) can help, but obscured or poorly timed communications create significant challenges. To address this, we present a mobile smart pole interaction unit (SPIU) with integrated cameras and LED displays, designed as a pedestrian-side system to deliver explicit cues (``WALK,'' ``STOP''). An in-the-wild evaluation of the SPIU (N=21) using a four-factor analysis (CarBehavior, Mobility, eHMI, SPIU) showed that the SPIU improved understandability, trust, and perceived safety, and reduced workload compared with the baseline, with a combination (eHMI+SPIU) yielding the strongest results. Beyond these quantitative benefits, participants appreciated the mobility of the SPIU for its ``clear'' and ``easy to decide'' mediation. This work contributes to (1) a design and deployment framework for a mobile SPIU and (2) an in-the-wild evaluation protocol for pedestrian–AV interactions in nonsignalized spaces. Our work sparks discussions on real world evaluations involving detailed vehicle kinematics and accessible multimodality (e.g., audio), focusing on the role of personal robots as user-side eHMIs.},
  note      = {Honourable Mention Award},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
We are part of the University of Tokyo’s Graduate School of Information Science and Technology, Department of Creative Informatics, and focus on computer networks and cyber-physical systems.
Address
4F, I-REF building, Graduate School of Information Science and Technology, The University of Tokyo, 1-1-1, Yayoi, Bunkyo-ku, Tokyo, 113-8657 Japan
Room 91B1, Bld 2 of Engineering Department, The University of Tokyo, 7-3-1 Hongo, Bunkyo-ku, Tokyo 113-8656, Japan
Mail: