<!-- research.html -->

<!DOCTYPE HTML>
<html lang="en"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
  <title>Weidi Xie</title>
  
  <meta name="author" content="Weidi Xie">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  
  <link rel="stylesheet" type="text/css" href="stylesheet.css">
  <link rel="icon" type="image/svg+xml" href="images/icon.svg">
</head>

<body>
  <table id="container">
    <tr>
      <td>
        <!-- Site navigation bar: links to the top-level pages, followed by a horizontal rule. -->
        <table width="100%" align="center" border="0" cellspacing="0" cellpadding="20">
          <tr>
            <!-- Flow content must live inside a cell: a bare <p>/<hr> directly under <tr>
                 is foster-parented out of the table by the HTML parser. The cell padding is
                 zeroed so the bar keeps the position it had when it was rendered outside
                 the table. -->
            <td style="padding: 0;">
              <p align="center">
                <a href="index.html">Home</a>&nbsp;/&nbsp;
                <a href="about.html">About&nbsp;Me</a>&nbsp;/&nbsp;
                <a href="team.html">Team</a>&nbsp;/&nbsp;
                <a href="people.html">Collaborators</a>&nbsp;/&nbsp;
                <a href="research.html">Research</a>
              </p>
              <hr>
            </td>
          </tr>
        </table>

        <table width="130%" align="center" border="0" cellspacing="0" cellpadding="20">
            <tr>
              <td>
              <h2>2025</h2>
              <h3>AI4Medicine</h3>
              <ol>

                <li style="border: 2px solid #000000; padding: 10px; border-radius: 1px; background-color: #f0f0f0; max-width: 700px;">
                  <a href="https://arxiv.org/abs/2408.12547">
                  <papertitle> Towards Evaluating and Building Versatile Large Language Models for Medicine.</papertitle> 
                  </a>
                  <br>
                  Chaoyi Wu, Pengcheng Qiu, Jinxin Liu, Hongfei Gu, Na Li, Ya Zhang, Yanfeng Wang, <strong>Weidi Xie</strong> 
                  <br>
                  In: <em>npj Digital Medicine (Nature Portfolio, in press)</em>, 2025. (5-Year Impact Factor: ~15.2) &nbsp;<font color="red"><strong>(New)</strong></font> <br>
                  <a href="https://arxiv.org/abs/2408.12547">Arxiv</a> |
                  <a href="https://github.com/MAGIC-AI4Med/MedS-Ins">Code</a>| 
                  <a href="https://henrychur.github.io/MedS-Bench/">Leaderboard</a> 
              </li>
              <p> </p>

              <li style="border: 2px solid #000000; padding: 10px; border-radius: 1px; background-color: #f0f0f0; max-width: 700px;">
                <a href="https://arxiv.org/abs/2305.10415">
                <papertitle> PMC-VQA: Visual Instruction Tuning for Medical Visual Question Answering. </papertitle> 
                  </a>
                  <br>
                  Xiaoman Zhang, Chaoyi Wu, Ziheng Zhao, Weixiong Lin, Ya Zhang, Yanfeng Wang, <strong>Weidi Xie</strong>
                  <br>
                  In: <em> Nature Communications Medicine (in press)</em>, 2025. (2-Year Impact Factor: ~5.4)&nbsp<font color="red"><strong>(New)</strong></font> <br> 
                  <a href="https://xiaoman-zhang.github.io/PMC-VQA/">Project Page</a>  |
                  <a href="https://arxiv.org/pdf/2305.10415">Arxiv</a> </li>
                <p> </p>

                <li>
                  <a href="https://arxiv.org/abs/2412.13126">
                    <papertitle>A Knowledge-enhanced Pathology Vision-language Foundation Model for Cancer Diagnosis</papertitle> 
                    </a>
                    <br>
                    Xiao Zhou, Luoyi Sun, Dexuan He, Wenbin Guan, Ruifen Wang, Lifeng Wang, Xin Sun, Kun Sun, Ya Zhang, Yanfeng Wang, <strong>Weidi Xie</strong>
                    <br>
                    <em>Under Review</em>, 2025.&nbsp<font color="red"><strong>(New)</strong></font> <br> 
                    <a href="https://arxiv.org/abs/2412.13126">Arxiv</a> |
                    <a href="">Project Page</a>
                  </li>
                  <p> </p>

                  <li>
                  <a href="https://arxiv.org/abs/2412.09529">
                    <papertitle>Can Modern LLMs Act as Agent Cores in Radiology Environments?</papertitle> 
                    </a>
                    <br>
                    Qiaoyu Zheng, Chaoyi Wu, Pengcheng Qiu, Lisong Dai, Ya Zhang, Yanfeng Wang, <strong>Weidi Xie</strong>
                    <br>
                    <em>Under Review</em>, 2025.&nbsp<font color="red"><strong>(New)</strong></font> <br> 
                    <a href="https://arxiv.org/abs/2412.09529">Arxiv</a> |
                    <a href="">Project Page</a>
                  </li>
                  <p> </p>

                <li>
                  <a href="https://arxiv.org/abs/2412.04106">
                    <papertitle>MRGen: Diffusion-based Controllable Data Engine for MRI Segmentation towards Unannotated Modalities.</papertitle> 
                    </a>
                    <br>
                    Haoning Wu, Ziheng Zhao, Ya Zhang, <strong>Weidi Xie</strong>, Yanfeng Wang
                    <br>
                    <em>Under Review</em>, 2025.&nbsp<font color="red"><strong>(New)</strong></font> <br> 
                    <a href="https://arxiv.org/abs/2412.04106">Arxiv</a> |
                    <a href="https://haoningwu3639.github.io/MRGen/">Project Page</a>
                  </li>
                  <p> </p>

              </ol>
              <h3>Computer Vision</h3>
              <ol>
                <li>
                  <a href="">
                    <papertitle>A Sanity Check for AI-generated Image Detection.</papertitle> 
                    </a>
                    <br>
                    Shilin Yan, Ouxiang Li, Jiayin Cai, Yanbin Hao, Xiaolong Jiang, Yao Hu, <strong>Weidi Xie</strong>
                    <br>
                    In: <em> The Thirteenth International Conference on Learning Representations (ICLR) </em>, 2025.&nbsp<font color="red"><strong>(New)</strong></font> <br> 
                    <a href="">Arxiv</a> |
                    <a href="">Project Page</a> |
                    <a href="">公众号介绍</a> 
                  </li>
                  <p> </p>
                
                <li>
                  <a href="https://kuis-ai.github.io/track_on">
                    <papertitle>Track-On: Transformer-based Online Point Tracking with Memory.</papertitle> 
                    </a>
                    <br>
                    Görkay Aydemir, Xiongyi Cai, <strong>Weidi Xie</strong>, Fatma Guney
                    <br>
                    In: <em> The Thirteenth International Conference on Learning Representations (ICLR) </em>, 2025.&nbsp<font color="red"><strong>(New)</strong></font> <br> 
                    <a href="https://arxiv.org/pdf/2501.18487">Arxiv</a> |
                    <a href="https://kuis-ai.github.io/track_on">Project Page</a> |
                    <a href="https://mp.weixin.qq.com/s/ccddIWLvF0Vya7tNGv2L9A">公众号介绍</a> 
                  </li>
                  <p> </p>

                <li>
                  <a href="https://arxiv.org/abs/2408.14469">
                    <papertitle>Grounded Multi-Hop VideoQA in Long-Form Egocentric Videos.</papertitle> 
                    </a>
                    <br>
                    Qirui Chen, Shangzhe Di, <strong>Weidi Xie</strong>
                    <br>
                    In: <em>  Thirty-Ninth AAAI Conference on Artificial Intelligence (AAAI) </em>, 2025.&nbsp<font color="red"><strong>(New)</strong></font> <br> 
                    <a href="https://arxiv.org/abs/2408.14469">Arxiv</a> |
                    <a href="https://qirui-chen.github.io/MultiHop-EgoQA">Project Page</a> |
                    <a href="https://mp.weixin.qq.com/s/VW20R6-hC5WpwwptHYJBuQ">公众号介绍</a> 
                  </li>
                  <p> </p>

                  <li>
                    <a href="https://arxiv.org/abs/2412.01820">
                      <papertitle>Towards Universal Soccer Video Understanding.</papertitle> 
                      </a>
                      <br>
                      Jiayuan Rao, Haoning Wu, Hao Jiang, Ya Zhang, Yanfeng Wang, <strong>Weidi Xie</strong>
                      <br>
                      <em>Under Review</em>, 2025.&nbsp<font color="red"><strong>(New)</strong></font> <br> 
                      <a href="https://arxiv.org/abs/2412.01820">Arxiv</a> |
                      <a href="https://jyrao.github.io/UniSoccer/">Project Page</a>
                    </li>
                    <p> </p>

                    <li>
                      <a href="https://arxiv.org/abs/2412.01694">
                        <papertitle>Unlocking Video-LLM via Agent-of-Thoughts Distillation.</papertitle> 
                        </a>
                        <br>
                        Yudi Shi, Shangzhe Di, Qirui Chen, <strong>Weidi Xie</strong>
                        <br>
                        <em>Under Review</em>, 2025.&nbsp<font color="red"><strong>(New)</strong></font> <br> 
                        <a href="https://arxiv.org/abs/2412.01694">Arxiv</a> |
                        <a href="https://zhengrongz.github.io/AoTD/">Project Page</a>
                      </li>
                      <p> </p>
  
                      <li>
                        <a href="https://arxiv.org/abs/2412.01720">
                          <papertitle>LamRA: Large Multimodal Model as Your Advanced Retrieval Assistant.</papertitle> 
                          </a>
                          <br>
                          Yikun Liu, Pingan Chen, Jiayin Cai, Xiaolong Jiang, Yao Hu, Jiangchao Yao, Yanfeng Wang, <strong>Weidi Xie</strong>
                          <br>
                          <em>Under Review</em>, 2025.&nbsp<font color="red"><strong>(New)</strong></font> <br> 
                          <a href="https://arxiv.org/abs/2412.01720">Arxiv</a> |
                          <a href="https://code-kunkun.github.io/LamRA/">Project Page</a>
                        </li>
                        <p> </p>

              </ol>
              <h2>2024</h2>
              <h3>AI4Medicine</h3>
              <ol>
                <li style="border: 2px solid #000000; padding: 10px; border-radius: 1px; background-color: #f0f0f0; max-width: 700px;">
                  <a href="https://arxiv.org/abs/2304.14454">
                  <papertitle> PMC-LLaMA: Towards Building Open-source Language Models for Medicine. </papertitle> 
                    </a>
                    <br>
                    Chaoyi Wu*, Weixiong Lin*, Xiaoman Zhang, Ya Zhang, Yanfeng Wang, <strong>Weidi Xie</strong>
                    <br>
                    In: <em>Journal of the American Medical Informatics Association</em>, 2024. (JAMIA, Impact Factor: ~7.9) <br>
                    <a href="https://arxiv.org/abs/2304.14454">Arxiv</a> |
                    <a href="https://huggingface.co/chaoyi-wu/PMC_LLAMA_7B">Model</a> |
                    <a href="https://github.com/chaoyi-wu/PMC-LLaMA">Code</a>
                  </li>
                  <p> </p>
                

                  <li style="border: 2px solid #000000; padding: 10px; border-radius: 1px; background-color: #f0f0f0; max-width: 700px;">
                    <a href="https://www.nature.com/articles/s41467-024-52417-z">
                    <papertitle> Towards Building Multilingual Language Model for Medicine.</papertitle> 
                    </a>
                    <br>
                    Pengcheng Qiu*, Chaoyi Wu*, Xiaoman Zhang, Weixiong Lin, Haicheng Wang, Ya Zhang, Yanfeng Wang, <strong>Weidi Xie</strong> 
                    <br>
                    In: <em> Nature Communications</em>, 2024. (5-Year Impact Factor: ~16.1)&nbsp<font color="red"><strong>(New)</strong></font> <br> 
                    <a href="https://www.nature.com/articles/s41467-024-52417-z">Paper</a> |
                    <a href="https://github.com/MAGIC-AI4Med/MMedLM">Code</a>| 
                    <a href="https://github.com/MAGIC-AI4Med/MMedLM">Model</a>| 
                    <a href="https://github.com/MAGIC-AI4Med/MMedLM">Dataset</a> 
                  </li>
                  <p> </p>


                  <li style="border: 2px solid #000000; padding: 10px; border-radius: 1px; background-color: #f0f0f0; max-width: 700px;">
                    <a href="https://www.nature.com/articles/s41467-024-54424-6">
                      <papertitle>Large-scale Long-tailed Disease Diagnosis on Radiology Images.</papertitle>
                    </a>
                    <br>
                    Qiaoyu Zheng, Weike Zhao, Chaoyi Wu, Xiaoman Zhang, Ya Zhang, Yanfeng Wang, <strong>Weidi Xie</strong> 
                    <br>
                    In: <em> Nature Communications</em>, 2024. (5-Year Impact Factor: ~16.1)&nbsp<font color="red"><strong>(New)</strong></font> <br> 
                    <a href="https://www.nature.com/articles/s41467-024-54424-6">Paper</a> |
                    <a href="https://qiaoyu-zheng.github.io/RP3D-Diag">Project Page</a>  
                  </li>
                  <p> </p>

                  <li>
                    <a href="https://www.medrxiv.org/content/10.1101/2024.06.24.24309405v1">
                      <papertitle>RaTEScore: A Metric for Radiology Report Generation.</papertitle> 
                      </a>
                      <br>
                      Weike Zhao, Chaoyi Wu, Xiaoman Zhang, Ya Zhang, Yanfeng Wang, <strong>Weidi Xie</strong>
                      <br>
                      In: <em> Empirical Methods in Natural Language Processing (EMNLP)</em>, 2024.&nbsp<font color="red"><strong>(New)</strong></font> <br> 
                      <a href="https://www.medrxiv.org/content/10.1101/2024.06.24.24309405v1">medRxiv</a> |
                      <a href="https://github.com/MAGIC-AI4Med/RaTEScore">Code</a> |
                      <a href="https://angelakeke.github.io/RaTEScore/">Project Page</a> 
                    </li>
                    <p> </p>

                  <li>
                    <a href="https://arxiv.org/abs/2404.09942">
                    <papertitle> Knowledge-enhanced Visual-Language Pretraining for Computational Pathology.</papertitle> 
                    </a>
                    <br>
                    Xiao Zhou, Xiaoman Zhang, Chaoyi Wu, Ya Zhang, <strong>Weidi Xie</strong>, Yanfeng Wang
                    <br>
                    In: <em> European Conference on Computer Vision (ECCV)</em>, 2024.&nbsp<font color="red"><strong>(Oral Presentation)</strong></font> <br> 
                    <a href="https://arxiv.org/abs/2404.09942">Arxiv</a> |
                    <a href="https://github.com/MAGIC-AI4Med/KEP">Code</a> |
                    <a href="https://mp.weixin.qq.com/s?__biz=MzkyNjcyNTQ1MQ==&mid=2247483850&idx=1&sn=8484ae383e53525a9e5d371283b5349c&chksm=c233a5eef5442cf82cda84f9d67003fc50ec36dda853c22f146479f6d81c4e6e262e86b70fb3&token=794637245&lang=zh_CN#rd">公众号介绍</a> 
                  </li>
                  <p> </p>

                <li>
                    <a href="https://www.sciencedirect.com/science/article/pii/S1361841524000720">
                      <papertitle>Sensorless Volumetric Reconstruction of Fetal Brain Freehand Ultrasound Scans with Deep Implicit Representation.</papertitle> 
                    </a>
                    <br>
                    Pak-Hei Yeung, Linde S. Hesse, Moska Aliasi, Monique C. Haak, INTERGROWTH-21st Consortium, <strong>Weidi Xie</strong>, Ana I.L. Namburete 
                    <br> In: <em>Medical Image Analysis, 2024. </em>(Impact Factor: ~11)<br> 
                  </li>
                  <p> </p>

                <li>
                    <a href="https://arxiv.org/abs/2407.16684">
                      <papertitle>AutoRG-Brain: Grounded Report Generation for Brain MRI.</papertitle> 
                      </a>
                      <br>
                      Jiayu Lei, Xiaoman Zhang, Chaoyi Wu, Lisong Dai, Ya Zhang, Yanyong Zhang, Yanfeng Wang, <strong>Weidi Xie</strong>, Yuehua Li
                      <br>
                      In <em> Submission.</em>&nbsp<font color="red"><strong>(New)</strong></font><br>
                      <a href="https://arxiv.org/abs/2407.16684">Arxiv</a> |
                      <a href="">Model</a> |
                      <a href="">Code</a>
                    </li>
                    <p> </p>

                <li>
                    <a href="https://arxiv.org/abs/2404.16754">
                      <papertitle>RadGenome-Chest CT: A Grounded Vision-Language Dataset for Chest CT Analysis.</papertitle> 
                      </a>
                      <br>
                      Xiaoman Zhang, Chaoyi Wu, Ziheng Zhao, Jiayu Lei, Ya Zhang, Yanfeng Wang, <strong>Weidi Xie</strong>
                      <br>
                      In <em>Submission.</em> &nbsp<font color="red"><strong>(New)</strong></font> <br>
                      <a href="https://arxiv.org/abs/2404.16754">Arxiv</a> |
                      <a href="">Model</a> |
                      <a href="">Code</a>
                    </li>
                    <p> </p>

                  <li>
                    <a href="https://chaoyi-wu.github.io/RadFM/">
                    <papertitle>Towards Generalist Foundation Model for Radiology by Leveraging Web-scale 2D & 3D Medical Data.</papertitle>
                    </a>
                    <br>
                    Chaoyi Wu*, Xiaoman Zhang*, Ya Zhang, Yanfeng Wang, <strong>Weidi Xie</strong> <br>
                    <em> Under Review for Nature Communications</em>, 2024.&nbsp<font color="red"><strong>(New)</strong></font> <br> 
                    <a href="https://chaoyi-wu.github.io/RadFM/">Project Page</a>  |
                    <a href="https://github.com/chaoyi-wu/RadFM">Code & Model</a>  |
                    <a href="https://arxiv.org/abs/2308.02463">Arxiv</a></li>
                    <p> </p>


                  <li>
                    <a href="https://zhaoziheng.github.io/SAT/">
                    <papertitle>One Model to Rule them All: Towards Universal Segmentation for Medical Images with Text Prompts.</papertitle>
                    </a>
                    <br>
                    Ziheng Zhao, Yao Zhang, Chaoyi Wu, Xiaoman Zhang, Ya Zhang, Yanfeng Wang, <strong>Weidi Xie</strong> 
                    <br>
                    <em> Under Review for Nature Communications</em>, 2024.&nbsp<font color="red"><strong>(New)</strong></font> <br> 
                    <a href="https://zhaoziheng.github.io/SAT/">Project Page</a>  |
                    <a href="https://arxiv.org/abs/2312.17183">Arxiv</a>
                  </li>
                  <p> </p>

                </ol>
                <h3>Computer Vision</h3>
                <ol>

                <li>
                  <a href="https://arxiv.org/abs/2310.06836">
                  <papertitle>A General Protocol to Probe Large Vision Models for 3D Physical Understanding</papertitle>
                  </a>
                  <br>
                  Guanqi Zhan, Chuanxia Zheng, <strong>Weidi Xie</strong>, Andrew Zisserman<br>
                  In: <em>Conference on Neural Information Processing Systems (NeurIPS) </em>, 2024. <br> 
                  <a href="">Project Page</a>  |
                  <a href="https://arxiv.org/abs/2310.06836">Arxiv</a> </li>
                  <p> </p>

                  <li>
                    <a href="https://arxiv.org/abs/2406.18530">
                      <papertitle>MatchTime: Towards Automatic Soccer Game Commentary Generation.</papertitle> 
                      </a>
                      <br>
                      Jiayuan Rao, Haoning Wu, Chang Liu, Yanfeng Wang, <strong>Weidi Xie</strong>
                      <br>
                      In: <em> Empirical Methods in Natural Language Processing (EMNLP)</em>, 2024.&nbsp<font color="red"><strong>(New)</strong></font> <br> 
                      <a href="https://arxiv.org/abs/2406.18530">Arxiv</a> |
                      <a href="https://haoningwu3639.github.io/MatchTime/">Project Page</a> |
                      <a href="https://mp.weixin.qq.com/s?__biz=MzkyNjcyNTQ1MQ==&mid=2247483856&idx=1&sn=33854777c861b435e431655e9238daf0&chksm=c233a5f4f5442ce205035e945c51cb7981582ccfc6892d1aa4869cba0636931ebf7d8b182df3&token=794637245&lang=zh_CN#rd">公众号介绍</a> 
                    </li>
                    <p> </p>

                  <li>
                    <a href="https://arxiv.org/abs/2407.12735">
                      <papertitle>EchoSight: Advancing Visual-Language Models with Wiki Knowledge.</papertitle> 
                      </a>
                      <br>
                      Yibin Yan, <strong>Weidi Xie</strong>
                      <br>
                      In: <em> Empirical Methods in Natural Language Processing (EMNLP)</em>, 2024.&nbsp<font color="red"><strong>(New)</strong></font> <br> 
                      <a href="https://arxiv.org/abs/2407.12735">Arxiv</a> |
                      <a href="https://github.com/Go2Heart/EchoSight">Code</a> |
                      <a href="https://go2heart.github.io/echosight/">Project Page</a> 
                    </li>
                    <p> </p>
                    

                  <li>
                    <a href="https://arxiv.org/abs/2407.15850">
                      <papertitle>AutoAD-Zero: A Training-Free Framework for Zero-Shot Audio Description.</papertitle> 
                      </a>
                      <br>
                      Junyu Xie, Tengda Han, Max Bain, Arsha Nagrani, Gül Varol, <strong>Weidi Xie</strong>, Andrew Zisserman
                      <br>
                      In: <em> Asian Conference on Computer Vision (ACCV)</em>, 2024.&nbsp<font color="red"><strong>(New)</strong></font> <br> 
                      <a href="https://arxiv.org/abs/2407.15850">Arxiv</a> |
                      <a href="https://www.robots.ox.ac.uk/~vgg/research/autoad-zero/">Project Page</a> 
                    </li>
                    <p> </p>
                  
                  <li>
                    <a href="https://arxiv.org/abs/2404.12389">
                      <papertitle>Moving Object Segmentation: All You Need Is SAM (and Flow).</papertitle> 
                      </a>
                      <br>
                      Junyu Xie, Charig Yang, <strong>Weidi Xie</strong>, Andrew Zisserman
                      <br>
                      In: <em> Asian Conference on Computer Vision (ACCV)</em>, 2024.&nbsp<font color="red"><strong>(New)</strong></font> <br> 
                      <a href="https://arxiv.org/abs/2404.12389">Arxiv</a> |
                      <a href="https://www.robots.ox.ac.uk/~vgg/research/flowsam/">Project Page</a> 
                    </li>
                    <p> </p>
                  
                  <li>
                    <a href="https://arxiv.org/abs/2309.11500">
                    <papertitle>A Large-scale Dataset for Audio-Language Representation Learning.</papertitle>
                    </a>
                    <br>
                    Luoyi Sun, Xuenan Xu, Mengyue Wu, <strong>Weidi Xie</strong><br>
                    In: <em>ACM Multimedia</em>, 2024.&nbsp<font color="red"><strong>(New)</strong></font> <br> 
                    <a href="https://auto-acd.github.io">Project Page</a>  |
                    <a href="https://arxiv.org/abs/2309.11500">Arxiv</a> </li>
                    <p> </p>

                  <li>
                    <a href="https://lzq5.github.io/Video-Text-Alignment/">
                      <papertitle>Multi-Sentence Grounding for Long-term Instructional Video.</papertitle>
                    </a>
                    <br>
                    Zeqian Li*, Qirui Chen*, Tengda Han, Ya Zhang, Yanfeng Wang, <strong>Weidi Xie</strong><br>
                    In: <em> European Conference on Computer Vision (ECCV)</em>, 2024.&nbsp<font color="red"><strong>(New)</strong></font> <br> 
                    <a href="https://lzq5.github.io/Video-Text-Alignment/">Project Page</a>  |
                    <a href="https://arxiv.org/abs/2312.14055">Paper</a>
                  </li>
                  <p> </p>

                  <li>
                    <a href="https://arxiv.org/abs/2312.11463">
                      <papertitle>Appearance-based Refinement for Object-Centric Motion Segmentation.</papertitle>
                    </a>
                    <br>
                    Junyu Xie, <strong>Weidi Xie</strong>, Andrew Zisserman <br>
                    In: <em> European Conference on Computer Vision (ECCV)</em>, 2024.&nbsp<font color="red"><strong>(New)</strong></font> <br> 
                    <a href="">Project Page</a>  |
                    <a href="https://arxiv.org/abs/2312.11463">Paper</a></li>
                    <p> </p>

                  <li>
                      <a href="https://arxiv.org/abs/2407.11325">
                        <papertitle>VISA: Reasoning Video Object Segmentation via Large Language Model.</papertitle>
                      </a>
                      <br>
                      Cilin Yan, Haochen Wang, Shilin Yan, Xiaolong Jiang, Yao Hu, Guoliang Kang, <strong>Weidi Xie</strong>, Efstratios Gavves <br>
                      In: <em> European Conference on Computer Vision (ECCV)</em>, 2024.&nbsp<font color="red"><strong>(New)</strong></font> <br> 
                      <a href="https://github.com/cilinyan/VISA">Code & Model</a>  |
                      <a href="https://arxiv.org/abs/2407.11325">Paper</a></li>
                      <p> </p>

                  <li>
                      <a href="https://arxiv.org/abs/2404.16828">
                        <papertitle>Made to Order: Discovering Monotonic Temporal Changes via Self-supervised Video Ordering.</papertitle>
                      </a>
                      <br>
                      Charig Yang, <strong>Weidi Xie</strong>, Andrew Zisserman<br>
                      In: <em> European Conference on Computer Vision (ECCV)</em>, 2024.&nbsp<font color="red"><strong>(New)</strong></font> <br> 
                      <a href="https://charigyang.github.io/order/">Project Page</a>  |
                      <a href="https://arxiv.org/abs/2404.16828">Paper</a></li>
                      <p> </p>

                  <li>
                    <a href="https://haoningwu3639.github.io/StoryGen_Webpage/">
                      <papertitle>Intelligent Grimm - Open-ended Visual Storytelling via Latent Diffusion Models.</papertitle>
                    </a>
                    <br>
                    Chang Liu*, Haoning Wu*, Yujie Zhong, Xiaoyun Zhang, Yanfeng Wang, <strong>Weidi Xie</strong> <br>
                    In: <em>Conference on Computer Vision and Pattern Recognition (CVPR) </em>, 2024. &nbsp <font color="red"><strong>(New)</strong></font> <br>
                    <a href="https://haoningwu3639.github.io/StoryGen_Webpage/">Project Page</a>  |
                  <a href="https://arxiv.org/abs/2306.00973">Arxiv</a>
                  </li>
                  <p> </p>

                  <li>
                    <a href="https://www.robots.ox.ac.uk/~vgg/research/autoad/">
                      <papertitle>AutoAD III: The Prequel -- Back to the Pixels.</papertitle>
                    </a>
                    <br>
                    Tengda Han, Max Bain, Arsha Nagrani, Gül Varol, <strong>Weidi Xie</strong>, Andrew Zisserman <br>
                    In: <em>Conference on Computer Vision and Pattern Recognition (CVPR) </em>, 2024. &nbsp <font color="red"><strong>(New)</strong></font> <br>
                    <a href="https://www.robots.ox.ac.uk/~vgg/research/autoad/">Project Page</a>  |
                  <a href="https://arxiv.org/abs/2404.14412">Paper</a>
                  </li>
                  <p> </p>

                  <li>
                    <a href="https://fcjian.github.io/InstaGen/">
                      <papertitle>InstaGen: Enhancing Object Detection by Training on Synthetic Dataset.</papertitle>
                    </a>
                    <br>
                    Chengjian Feng, Yujie Zhong, Zequn Jie^&dagger;, <strong>Weidi Xie^&dagger;</strong>, Lin Ma<br>
                    In: <em>Conference on Computer Vision and Pattern Recognition (CVPR) </em>, 2024. &nbsp <font color="red"><strong>(New)</strong></font> <br>
                    <a href="https://fcjian.github.io/InstaGen/">Project Page</a>  |
                    <a href="https://arxiv.org/abs/2402.05937">Paper</a>
                  </li>
                  <p> </p>

                  <li>
                    <a href="https://arxiv.org/abs/2401.00789">
                      <papertitle>Retrieval-Augmented Egocentric Video Captioning.</papertitle>
                    </a>
                    <br>
                    Jilan Xu, Yifei Huang, Junlin Hou, Guo Chen, Yuejie Zhang, Rui Feng, <strong>Weidi Xie</strong><br>
                    In: <em>Conference on Computer Vision and Pattern Recognition (CVPR) </em>, 2024. &nbsp <font color="red"><strong>(New)</strong></font> <br>
                  <a href="">Project Page</a>  |
                  <a href="https://arxiv.org/abs/2401.00789">Paper</a>
                  </li>
                  <p> </p>

                  <li>
                    <a href="https://arxiv.org/abs/2312.06505">
                      <papertitle>Grounded Question-Answering in Long Egocentric Videos.</papertitle>
                    </a>
                    <br>
                    Shangzhe Di, <strong>Weidi Xie</strong> <br>
                    In: <em>Conference on Computer Vision and Pattern Recognition (CVPR) </em>, 2024. &nbsp <font color="red"><strong>(New)</strong></font> <br>
                    <a href="https://dszdsz.cn/GroundVQA/index.html">Project Page</a>  |
                    <a href="https://arxiv.org/abs/2312.06505">Paper</a>
                  </li>
                  <p> </p>

                  <li>
                    <a href="https://www.robots.ox.ac.uk/~vgg/research/amodal/">
                      <papertitle>Amodal Ground Truth and Completion in the Wild.</papertitle>
                    </a>
                    <br>
                    Guanqi Zhan, Chuanxia Zheng, <strong>Weidi Xie</strong>, Andrew Zisserman <br>
                    In: <em>Conference on Computer Vision and Pattern Recognition (CVPR) </em>, 2024. &nbsp <font color="red"><strong>(New)</strong></font> <br>
                    <a href="https://www.robots.ox.ac.uk/~vgg/research/amodal/">Project Page</a>  |
                  <a href="https://arxiv.org/abs/2312.17247">Paper</a>
                  </li>
                  <p> </p>

                  <li style="border: 2px solid #000000; padding: 10px; border-radius: 1px; background-color: #f0f0f0; max-width: 700px;">
                    <a href="https://link.springer.com/article/10.1007/s11263-024-02144-1">
                      <span style="font-weight: bold; color: #007bff;">OV-DAR: Open-vocabulary Object Detection and Attributes Recognition.</span>
                    </a>
                    <br>
                    Keyan Chen*, Xiaolong Jiang*, Yao Hu, Xu Tang, Yan Gao, Jianqi Chen, <strong>Weidi Xie</strong> 
                    <br>
                    In: <em> International Journal of Computer Vision</em>, 2024.  (IJCV, Impact Factor: ~19.5, Corr Author)  &nbsp <font color="red"><strong>(New)</strong></font><br> 
                    <a href="https://kyanchen.github.io/OvarNet/">Project Page</a>  |
                    <a href="https://link.springer.com/article/10.1007/s11263-024-02144-1">Journal Version</a>
                  </li>
                  <p> </p>

                  <li style="border: 2px solid #000000; padding: 10px; border-radius: 1px; background-color: #f0f0f0; max-width: 700px;">
                    <a href="https://link.springer.com/article/10.1007/s11263-024-02076-w#citeas">
                      <span style="font-weight: bold; color: #007bff;">OV-VIS: Open-Vocabulary Video Instance Segmentation.</span>
                    </a>
                    <br>
                    Haochen Wang, Shuai Wang, Cilin Yan, Xiaolong Jiang, Xu Tang, Yao Hu, <strong>Weidi Xie^&dagger;</strong>, Efstratios Gavves<br>
                    In: <em> International Journal of Computer Vision</em>, 2024.  (IJCV, Impact Factor: ~19.5, Corr Author)  &nbsp <font color="red"><strong>(New)</strong></font><br> 
                    <a href="https://github.com/haochenheheda/LVVIS">Code</a>  |
                    <a href="https://link.springer.com/article/10.1007/s11263-024-02076-w">Journal Version</a>
                  </li>
                  <p> </p>

              </ol>
              <h2>2023</h2>
              <h3>AI4Medicine</h3>
              <ol>

                <li>
                  <a href="data/GPT_4V_evaluation_medical.pdf">
                  <papertitle>Can GPT-4V(ision) Serve Medical Applications? Case Studies on GPT-4V for Multimodal Medical Diagnosis.</papertitle>
                  </a>
                  <br>
                  Chaoyi Wu*, Jiayu Lei*, Qiaoyu Zheng*, Weike Zhao*, Weixiong Lin*, Xiaoman Zhang*, Xiao Zhou*, Ziheng Zhao*,  <br> 
                  Ya Zhang, Yanfeng Wang, <strong>Weidi Xie</strong> <br>

                  <em> Technical Report</em>, 2023. <br> 
                  <a href="https://chaoyi-wu.github.io/RadFM/">Project Page</a>  |
                  <a href="data/GPT_4V_evaluation_medical.pdf">Paper</a></li>
                  <p> </p>
  
                  <li style="border: 2px solid #000000; padding: 10px; border-radius: 1px; background-color: #f0f0f0; max-width: 700px;">
                    <a href="https://arxiv.org/abs/2302.14042">
                      <span style="font-weight: bold; color: #007bff;">Knowledge-enhanced Pre-training for Auto-diagnosis of Chest Radiology Images.</span>
                    </a>
                    <br>
                    Xiaoman Zhang, Chaoyi Wu, Ya Zhang, Yanfeng Wang, <strong>Weidi Xie</strong> <br>
                    In: <em> Nature Communications</em>, 2023.  (5-Year Impact Factor: ~16.1) <br> 
                  <a href="https://xiaoman-zhang.github.io/KAD/">Project Page</a>  |
                  <a href="https://github.com/xiaoman-zhang/KAD">Code & Model</a>  |
                  <a href="https://www.nature.com/articles/s41467-023-40260-7">Paper</a>
                  </li>
                  <p> </p>
  
                <li>
                  <a href="https://www.medrxiv.org/content/10.1101/2023.01.10.23284412v1">
                  <papertitle>MedKLIP: Medical Knowledge Enhanced Language-Image Pre-Training.</papertitle>
                  </a>
                  <br>
                  Chaoyi Wu, Xiaoman Zhang, Ya Zhang, Yanfeng Wang, <strong>Weidi Xie</strong> <br>
                  In: <em> International Conference on Computer Vision (ICCV)</em> , 2023. <br> 
                  <a href="https://chaoyi-wu.github.io/MedKLIP/">Project Page</a>  |
                  <a href="https://github.com/MediaBrain-SJTU/MedKLIP">Code & Model</a>  |
                  <a href="https://www.medrxiv.org/content/10.1101/2023.01.10.23284412v1.full.pdf">Arxiv</a></li>
                  <p> </p>
  
                <li>
                  <a href="https://arxiv.org/pdf/2303.07240">
                  <papertitle>PMC-CLIP: Contrastive Language-Image Pre-training using Biomedical Documents.</papertitle> 
                  </a>
                  <br>
                  Weixiong Lin*, Ziheng Zhao*, Xiaoman Zhang, Chaoyi Wu, Ya Zhang, Yanfeng Wang, <strong>Weidi Xie</strong> <br>
                  In: <em> International Conference on Medical Image Computing and Computer Assisted Intervention (MICCAI)</em>, 2023.<br>
                  <font color="red"><strong>MICCAI Young Scientist Publication Impact Award, Finalist</strong></font> <br>
                  <a href="https://weixionglin.github.io/PMC-CLIP/">Project Page</a>  |
                  <a href="https://github.com/WeixiongLin/PMC-CLIP/">Code & Model</a>  |
                  <a href="https://arxiv.org/pdf/2303.07240">Arxiv</a></li>
                  <p> </p>
             
                <li>
                  <a href="https://link.springer.com/chapter/10.1007/978-3-031-48593-0_10">
                  <papertitle>Deep Facial Phenotyping with Mixup Augmentation.</papertitle>
                  </a>
                  <br>
                  Jonathan Campbell, Mitchell Dawson, Andrew Zisserman, <strong>Weidi Xie</strong>, Christoffer Nellåker<br>
                  In: <em>Annual Conference on Medical Image Understanding and Analysis</em>. <br> 
                  <a href="https://link.springer.com/chapter/10.1007/978-3-031-48593-0_10">Paper</a></li>
                  <p> </p>

                <li>
                  <a href="https://arxiv.org/abs/2302.11557">
                  <papertitle>K-Diag: Knowledge-enhanced Disease Diagnosis in Radiographic Imaging.</papertitle>
                  </a>
                  <br>
                  Chaoyi Wu*, Xiaoman Zhang*, Yanfeng Wang, Ya Zhang, <strong>Weidi Xie</strong> <br>
                  In: <em>Big Task Small Data, 1001-AI, MICCAI 2023 Workshop (Oral)</em>. <br> 
                  <a href="https://chaoyi-wu.github.io/K-Diag/">Project Page</a>  |
                  <a href="https://arxiv.org/abs/2302.11557">Arxiv</a></li>
                  <p> </p>

                <li>
                  <a href="https://ieeexplore.ieee.org/abstract/document/10032792/">
                  <papertitle>Self-supervised Tumor Segmentation with Sim2Real Adaptation.</papertitle>
                  </a>
                  <br>
                  Xiaoman Zhang, <strong>Weidi Xie</strong>, Chaoqin Huang, Ya Zhang, Xin Chen, Qi Tian, Yanfeng Wang <br>
                  In: <em>IEEE Journal of Biomedical and Health Informatics</em>, 2023. (Impact Factor: ~7)<br>
                  <a href="https://xiaoman-zhang.github.io/Layer-Decomposition/">Project Page</a>  |
                  <a href="https://arxiv.org/abs/2109.03230">Arxiv</a></li>
                  <p> </p>

              
              </ol>
              <h3>Computer Vision</h3>
              <ol>

                <li>
                  <a href="https://arxiv.org/abs/2310.06907">
                  <papertitle>Self-supervised Object-Centric Learning for Videos.</papertitle>
                  </a>
                  <br>
                  Görkay Aydemir, <strong>Weidi Xie</strong>, Fatma Güney<br>
                  In: <em>Conference on Neural Information Processing Systems (NeurIPS) </em>, 2023. <br> 
                  <a href="https://arxiv.org/abs/2310.06907">Arxiv</a> </li>
                  <p> </p>


                <li>
                  <a href="https://code-kunkun.github.io/ZS-CIR/">
                  <papertitle>Zero-shot Composed Text-Image Retrieval.</papertitle>
                  </a>
                  <br>
                  Yikun Liu, Jiangchao Yao, Yanfeng Wang, Ya Zhang, <strong>Weidi Xie</strong><br>
                  In: <em>British Machine Vision Conference (BMVC) </em>, 2023. <br> 
                  <a href="https://code-kunkun.github.io/ZS-CIR/">Project Page</a>  |
                  <a href="https://arxiv.org/abs/2306.07272">Arxiv</a> </li>
                  <p> </p>

                 <li>
                  <a href="https://haoningwu3639.github.io/VFI_Adapter_Webpage/">
                  <papertitle>Boost Video Frame Interpolation via Simple Motion Adaptation.</papertitle>
                  </a>
                  <br>
                  Haoning Wu, Xiaoyun Zhang, <strong>Weidi Xie</strong>, Ya Zhang, Yanfeng Wang <br>
                  In: <em>British Machine Vision Conference (BMVC) </em>, 2023. (Oral) <br> 
                  <a href="https://haoningwu3639.github.io/VFI_Adapter_Webpage/">Project Page</a>  |
                  <a href="https://arxiv.org/abs/2306.13933">Arxiv</a> </li>
                  <p> </p>

                <li>
                  <a href="https://jinxiang-liu.github.io/anno-free-AVS/">
                  <papertitle>Annotation-free Audio-Visual Segmentation.</papertitle>
                  </a>
                  <br>
                  Jinxiang Liu, Yu Wang, Chen Ju, Ya Zhang, <strong>Weidi Xie</strong><br>
                  In: <em>IEEE/CVF Winter Conference on Applications of Computer Vision (WACV)</em>, 2023. <br> 
                  <a href="https://jinxiang-liu.github.io/anno-free-AVS/">Project Page</a>  |
                  <a href="https://arxiv.org/abs/2305.11019">Arxiv</a> </li>
                  <p> </p>

                <li>
                  <a href="https://arxiv.org/abs/2301.05221">
                  <papertitle>Open-vocabulary Object Segmentation with Diffusion Models.</papertitle>
                  </a>
                  <br>
                  Ziyi Li*, Qinye Zhou*, Xiaoyun Zhang, Ya Zhang, Yanfeng Wang, <strong>Weidi Xie</strong> <br>
                  In: <em> International Conference on Computer Vision (ICCV)</em> , 2023. <br> 
                  <a href="https://lipurple.github.io/Grounded_Diffusion/">Project Page</a>  |
                  <a href="https://github.com/Lipurple/Grounded-Diffusion">Code & Model</a>  |
                  <a href="https://arxiv.org/abs/2301.05221">Arxiv</a></li>
                  <p> </p>

                 <li>
                  <a href="https://arxiv.org/abs/2310.06838">
                  <papertitle>AutoAD II: The Sequel – Who, When, and What in Movie Audio Description.</papertitle>
                  </a>
                  <br>
                  Tengda Han, Max Bain, Arsha Nagrani, Gül Varol, <strong>Weidi Xie</strong>, Andrew Zisserman <br>
                  In: <em> International Conference on Computer Vision (ICCV)</em> , 2023. <br> 
                  <a href="https://www.robots.ox.ac.uk/~vgg/research/autoad/">Project Page</a>  |
                  <a href="https://www.robots.ox.ac.uk/~vgg/publications/2023/Han23a/han23a.pdf">Paper</a></li>
                  <p> </p>

                <li>
                  <a href="https://www.robots.ox.ac.uk/~vgg/publications/2023/Lamdouar23/lamdouar23.pdf">
                  <papertitle>The Making and Breaking of Camouflage.</papertitle>
                  </a>
                  <br>
                  Hala Lamdouar,  <strong>Weidi Xie</strong>, Andrew Zisserman<br>
                  In: <em> International Conference on Computer Vision (ICCV)</em> , 2023. <br> 
                  <a href="https://www.robots.ox.ac.uk/~vgg/publications/2023/Lamdouar23/lamdouar23.pdf">Paper</a></li>
                  <p> </p>
                
                <li>
                  <a href="https://arxiv.org/pdf/2304.01715">
                  <papertitle>Towards Open-Vocabulary Video Instance Segmentation.</papertitle>
                  </a>
                  <br>
                  Haochen Wang, Shuai Wang, Cilin Yan, Xiaolong Jiang, Xu Tang, Yao Hu, <strong>Weidi Xie*</strong>, Efstratios Gavves<br>
                  In: <em> International Conference on Computer Vision (ICCV)</em> , 2023. <br> 
                  <a href="https://arxiv.org/pdf/2304.01715">Arxiv</a></li>
                  <p> </p>

                <li>
                  <a href="https://arxiv.org/abs/2308.04808">
                  <papertitle>Joint-Relation Transformer for Multi-person Motion Prediction.</papertitle>
                  </a>
                  <br>
                  Qingyao Xu, Weibo Mao, Jingze Gong, Chenxin Xu, Siheng Chen, <strong>Weidi Xie</strong>, Ya Zhang, Yanfeng Wang<br>
                  In: <em> International Conference on Computer Vision (ICCV)</em> , 2023. <br> 
                  <a href="https://arxiv.org/abs/2308.04808">Arxiv</a></li>
                  <p> </p>

                <li>
                  <a href="https://arxiv.org/abs/2306.05493">
                  <papertitle>Multi-Modal Classifiers for Open-Vocabulary Object Detection.</papertitle>
                  </a>
                  <br>
                  Prannay Kaul, <strong>Weidi Xie</strong>, Andrew Zisserman
                  <br>
                  In: <em>International Conference on Machine Learning (ICML) </em>, 2023. <br> 
                  <a href="https://www.robots.ox.ac.uk/~vgg/research/mm-ovod/">Project Page</a>  |
                  <a href="https://arxiv.org/abs/2306.05493">Arxiv</a></li>
                  <p> </p>
                
                <li>
                  <a href="https://arxiv.org/abs/2308.08529">
                  <papertitle>Diagnosing Human-object Interaction Detectors.</papertitle>
                  </a>
                  <br>
                  Fangrui Zhu, Yiming Xie, <strong>Weidi Xie</strong>, Huaizu Jiang <br>
                  <em>Technical Report</em>, 2023.  <br> 
                  <a href="https://github.com/neu-vi/Diag-HOI">Code</a>  |
                  <a href="https://arxiv.org/abs/2308.08529">Arxiv</a> </li>
                  <p> </p>

                <li>
                  <a href="https://www.robots.ox.ac.uk/~vgg/research/arxiveri">
                  <papertitle>arXiVeri: Automatic Table Verification with GPT.</papertitle>
                  </a>
                  <br>
                  Gyungin Shin, <strong>Weidi Xie</strong>, Samuel Albanie
                  <br>
                  <em>Technical Report</em>, 2023. 
                  <br>
                  <a href="https://www.robots.ox.ac.uk/~vgg/research/arxiveri">Project Page</a>  |
                  <a href="https://arxiv.org/abs/2306.07968">Arxiv</a> </li>
                  <p> </p>

                <li>
                  <a href="https://www.robots.ox.ac.uk/~vgg/research/namedmask/">
                  <papertitle>Namedmask: Distilling Segmenters from Complementary Foundation Models.</papertitle>
                  </a>
                  <br>
                  Gyungin Shin, <strong>Weidi Xie</strong>, Samuel Albanie
                  <br>
                  In: <em>CVPR Workshop </em>, 2023. 
                  <br>
                  <a href="https://www.robots.ox.ac.uk/~vgg/research/namedmask/">Project Page</a>  |
                  <a href="https://arxiv.org/pdf/2209.11228.pdf">Arxiv</a></li>
                  <p> </p>

                <li>
                  <a href="https://www.robots.ox.ac.uk/~vgg/research/zutis">
                  <papertitle>Zero-shot Unsupervised Transfer Instance Segmentation.</papertitle>
                  </a>
                  <br>
                  Gyungin Shin, Samuel Albanie, <strong>Weidi Xie</strong>
                  <br>
                  In: <em>CVPR Workshop </em>, 2023. &nbsp; <font color="red"><strong>(Best Paper Award)</strong></font>
                  <br>
                  <a href="https://www.robots.ox.ac.uk/~vgg/research/zutis">Project Page</a>  |
                  <a href="https://arxiv.org/abs/2304.14376">Arxiv</a></li>
                  <p> </p>

                <li>
                  <a href="https://arxiv.org/abs/2303.16899">
                  <papertitle>AutoAD: Movie Description in Context.</papertitle>
                  </a>
                  <br>
                  Tengda Han, Max Bain, Arsha Nagrani, Gül Varol, <strong>Weidi Xie</strong>, Andrew Zisserman
                  <br>
                  In: <em>Conference on Computer Vision and Pattern Recognition (CVPR) </em>, 2023. &nbsp; <font color="red"><strong>(Highlight)</strong></font> 
                  <br>
                  <a href="https://www.robots.ox.ac.uk/~vgg/research/autoad/">Project Page</a>  |
                  <a href="https://arxiv.org/abs/2303.16899">Arxiv</a></li>
                  <p> </p>

                <li>
                  <a href="https://arxiv.org/pdf/2303.13560">
                  <papertitle>Collaboration Helps Camera Overtake LiDAR in 3D Detection.</papertitle>
                  </a>
                  <br>
                  Yue Hu, Yifan Lu, Runsheng Xu, <strong>Weidi Xie</strong>, Siheng Chen, Yanfeng Wang
                  <br>
                  In: <em>Conference on Computer Vision and Pattern Recognition (CVPR) </em>, 2023.
                  <br>
                  <a href="https://arxiv.org/pdf/2303.13560">Arxiv</a> |
                  <a href="https://siheng-chen.github.io/dataset/CoPerception+/">Dataset</a> |
                  <a href="https://github.com/MediaBrain-SJTU/CoCa3D">Code</a> </li>
                  <p> </p>

                <li>
                  <a href="https://arxiv.org/abs/2301.09506">
                  <papertitle>OvarNet: Towards Open-vocabulary Object Attribute Recognition.</papertitle>
                  </a>
                  <br>
                  Keyan Chen*, Xiaolong Jiang*, Yao Hu, Xu Tang, Yan Gao, Jianqi Chen, <strong>Weidi Xie</strong> 
                  <br>
                  In: <em>Conference on Computer Vision and Pattern Recognition (CVPR) </em>, 2023.
                  <br>
                  <a href="https://kyanchen.github.io/OvarNet/">Project Page</a>  |
                  <a href="https://arxiv.org/abs/2301.09506">Arxiv</a></li>
                  <p> </p>
                    
                <li>
                  <a href="https://arxiv.org/abs/2301.09121">
                  <papertitle>Learning Open-vocabulary Semantic Segmentation Models From Natural Language Supervision.</papertitle>
                  </a>
                  <br>
                  Jilan Xu, Junlin Hou, Yuejie Zhang, Rui Feng, Yi Wang, Yu Qiao, <strong>Weidi Xie</strong> 
                  <br>
                  In: <em>Conference on Computer Vision and Pattern Recognition (CVPR) </em>, 2023.
                  <br>
                  <a href="https://jazzcharles.github.io/OVSegmentor/">Project Page</a>  |
                  <a href="https://arxiv.org/abs/2301.09121">Arxiv</a></li>
                  <p> </p>

                <li>
                  <a href="https://arxiv.org/pdf/2303.11732">
                  <papertitle>Multi-modal Prompting for Low-Shot Temporal Action Localization.</papertitle>
                  </a>
                  <br>
                  Chen Ju, Zeqian Li, Peisen Zhao, Ya Zhang, Xiaopeng Zhang, Qi Tian, Yanfeng Wang, <strong>Weidi Xie</strong> <br>
                  <em>Technical Report</em>, 2023. <br> 
                  <a href="https://arxiv.org/pdf/2303.11732">Arxiv</a></li>
                  <p> </p>
                  
                <li>
                  <a href="https://arxiv.org/abs/2208.03974">
                  <papertitle>Aerial Monocular 3D Object Detection.</papertitle>
                  </a>
                  <br>
                  Yue Hu, Shaoheng Fang, <strong>Weidi Xie</strong>, Siheng Chen <br>
                  In: <em>IEEE Robotics and Automation Letters (RA-L)</em>, 2023. (Impact Factor: ~4)<br>
                  <a href="https://arxiv.org/abs/2208.03974">Arxiv</a></li>
                  <p> </p>

                </ol>
                <h2>2022</h2>
                <ol>
                  <li>
                    <a href="https://arxiv.org/abs/2210.04889">
                    <papertitle>Turbo Training with Token Dropout.</papertitle>
                    </a>
                    <br>
                    Tengda Han, <strong>Weidi Xie</strong>, Andrew Zisserman
                    <br> 
                    In: <em>British Machine Vision Conference (BMVC) </em>, 2022. <br>
                    <a href="https://arxiv.org/abs/2210.04889">Arxiv</a>
                    </li>
                <p> </p>   

                <li>
                  <a href="https://arxiv.org/abs/2210.03417">
                  <papertitle>A Simple Plugin for Transforming Images to Arbitrary Scales.</papertitle>
                  </a>
                  <br>
                  Qinye Zhou, Ziyi Li, <strong>Weidi Xie&dagger;</strong>, Xiaoyun Zhang, Ya Zhang, Yanfeng Wang&dagger;
                  <br> 
                  In: <em>British Machine Vision Conference (BMVC) </em>, 2022. <br>
                  <a href="https://lipurple.github.io/ARIS_Webpage/">Project Page</a> |
                  <a href="https://arxiv.org/abs/2210.03417">Arxiv</a>
                  </li>
              <p> </p>  
              
              <li>
                <a href="https://arxiv.org/abs/2210.07055">
                <papertitle>Sparse in Space and Time: Audio-visual Synchronisation with Trainable Selectors.</papertitle>
                </a>
                <br>
                Vladimir Iashin, <strong>Weidi Xie</strong>, Esa Rahtu, Andrew Zisserman
                <br> 
                In: <em>British Machine Vision Conference (BMVC) </em>, 2022. &nbsp; <font color="red"><strong>(Spotlight)</strong></font> 
                <br> 
                <a href="http://v-iashin.github.io/SparseSync">Project Page</a> |
                <a href="https://arxiv.org/abs/2210.07055">Arxiv</a>
                </li>
            <p> </p>   

                <li>
                    <a href="https://arxiv.org/abs/2208.13721">
                    <papertitle>CounTR: Transformer-based Generalised Visual Counting.</papertitle>
                    </a>
                    <br>
                    Chang Liu, Yujie Zhong, Andrew Zisserman, <strong>Weidi Xie</strong>
                    <br> 
                    In: <em>British Machine Vision Conference (BMVC) </em>, 2022. <br>
                    <a href="https://verg-avesta.github.io/CounTR_Webpage/">Project Page</a> |
                    <a href="https://arxiv.org/abs/2208.13721">Arxiv</a>
                    </li>
                <p> </p>   

                <li>
                  <a href="https://arxiv.org/pdf/2206.06947">
                  <papertitle>K-Space Transformer for Fast MRI Reconstruction.</papertitle>
                  </a>
                  <br>
                  Ziheng Zhao, Tianjiao Zhang, <strong>Weidi Xie&dagger;</strong>, Yanfeng Wang&dagger;, Ya Zhang
                  <br>
                  In: <em>British Machine Vision Conference (BMVC) </em>, 2022. <br>
                  <a href="https://zhaoziheng.github.io/Website/K-Space-Transformer">Project Page</a> |
                  <a href="https://arxiv.org/pdf/2206.06947">Arxiv</a>
                  </li>
                  <p> </p>  

                  <li>
                    <a href="https://arxiv.org/abs/2210.15138">
                    <papertitle>Open-vocabulary Semantic Segmentation with Frozen Vision-Language Models.</papertitle>
                    </a>
                    <br>
                    Chaofan Ma, Yuhuan Yang, Yanfeng Wang, Ya Zhang, <strong>Weidi Xie</strong>
                    <br>
                    In: <em>British Machine Vision Conference (BMVC) </em>, 2022. &nbsp; <font color="red"><strong>(Oral Presentation)</strong></font> 
                    <br> 
                    <a href="https://arxiv.org/abs/2210.15138">Arxiv</a> |
                    <a href="https://github.com/chaofanma/Fusioner">Code</a>
                    </li>
                    <p> </p>  
                    
                    <li>
                      <a href="https://www.robots.ox.ac.uk/~vgg/research/tpod/">
                      <papertitle>A Tri-Layer Plugin to Improve Occluded Detection.</papertitle>
                      </a>
                      <br>
                      Guanqi Zhan, <strong>Weidi Xie</strong>, Andrew Zisserman
                      <br>
                      In: <em>British Machine Vision Conference (BMVC) </em>, 2022. &nbsp; <font color="red"><strong>(Oral Presentation)</strong></font> 
                      <br> 
                      <a href="https://www.robots.ox.ac.uk/~vgg/research/tpod/">Project Page</a> | 
                      <a href="https://arxiv.org/pdf/2210.10046">Arxiv</a>
                      </li>
                      <p> </p>  


                   <li>
                    <a href="https://omnimatte-sp.github.io">
                    <papertitle>Associating Objects and Their Effects in Video through Coordination Games.</papertitle>
                    </a>
                    <br>
                    Erika Lu, Forrester Cole, <strong>Weidi Xie</strong>, Tali Dekel, William T. Freeman, Andrew Zisserman, Michael Rubinstein
                    <br> 
                    In: <em>Conference on Neural Information Processing Systems (NeurIPS) </em>, 2022. <br>
                    <a href="https://omnimatte-sp.github.io">Project Page</a> |
                    <a href="https://openreview.net/pdf?id=hq-p55-qil9">Paper</a>
                    </li>
                <p> </p>  

                <li>
                  <a href="https://arxiv.org/abs/2206.07045">
                  <papertitle>ReCo: Retrieve and Co-segment for Zero-shot Transfer.</papertitle>
                  </a>
                  <br>
                  Gyungin Shin, <strong>Weidi Xie</strong>, Samuel Albanie
                  <br> 
                  In: <em>Conference on Neural Information Processing Systems (NeurIPS) </em>, 2022. <br>
                  <a href="https://www.robots.ox.ac.uk/~vgg/research/reco/">Project Page</a> |
                  <a href="https://arxiv.org/abs/2206.07045">Arxiv</a>
                  </li>
              <p> </p>  

                <li>
                    <a href="https://arxiv.org/abs/2207.02206">
                    <papertitle>Segmenting Moving Objects via an Object-Centric Layered Representation.</papertitle>
                    </a>
                    <br>
                    Junyu Xie, <strong>Weidi Xie</strong>, Andrew Zisserman
                    <br> 
                    In: <em>Conference on Neural Information Processing Systems (NeurIPS) </em>, 2022. <br>
                    <a href="https://arxiv.org/abs/2207.02206">Arxiv</a>
                    </li>
                <p> </p>   

                <li>
                    <a href="https://ju-chen.github.io/efficient-prompt/">
                    <papertitle>Prompting Visual-Language Models for Efficient Video Understanding.</papertitle>
                    </a>
                    <br>
                    Chen Ju, Tengda Han, Kunhao Zheng, Ya Zhang, <strong>Weidi Xie</strong>
                    <br>
                    In: <em>European Conference on Computer Vision (ECCV) </em>, 2022
                    <br> 
                    <a href="https://ju-chen.github.io/efficient-prompt/">Project Page</a> |
                    <a href="https://arxiv.org/pdf/2112.04478.pdf">Arxiv</a>
                    </li>
                <p> </p>   

                <li>
                  <a href="https://arxiv.org/abs/2203.16513">
                  <papertitle>PromptDet: Expand Your Detector Vocabulary with Uncurated Images.</papertitle>
                  </a>
                  <br>
                  Chengjian Feng, Yujie Zhong, Zequn Jie, Xiangxiang Chu, Haibing Ren, Xiaolin Wei, <strong>Weidi Xie&dagger;</strong>, Lin Ma
                  <br>
                  In: <em>European Conference on Computer Vision (ECCV) </em>, 2022
                  <br> 
                  <a href="https://fcjian.github.io/promptdet">Project Page</a> |
                  <a href="https://arxiv.org/abs/2203.16513">Arxiv</a> 
                  </li>
                  <p> </p>

                  <li>
                    <a href="https://arxiv.org/abs/2206.12772">
                    <papertitle>Exploiting Transformation Invariance and Equivariance for Self-supervised Sound Localisation.</papertitle>
                    </a>
                    <br>
                    Jinxiang Liu, Chen Ju, <strong>Weidi Xie</strong>, Ya Zhang
                    <br>
                    In: <em>ACM Multimedia </em>, 2022.
                    <br> 
                    <a href="https://arxiv.org/abs/2206.12772">Arxiv</a>
                    </li>
                    <p> </p> 
                  
                    <li>
                      <!-- TODO: no URLs are recorded for this entry; add href values to the anchors below. -->
                      <a>
                      <papertitle>Adaptive 3D Localization of 2D Freehand Ultrasound Brain Images.</papertitle>
                      </a>
                      <br>
                      Pak-Hei Yeung, Moska Aliasi, Monique Haak, the INTERGROWTH-21st Consortium, <strong>Weidi Xie</strong>, Ana I.L. Namburete
                      <br>
                      In: <em> International Conference on Medical Image Computing and Computer Assisted Intervention (MICCAI)</em>, 2022.
                      <br>
                      <a>Project Page</a> |
                      <a>Arxiv</a>
                      </li>
                    <p> </p>

                  <li>
                      <a href="https://arxiv.org/abs/2208.09592">
                      <papertitle>Transforming the Interactive Segmentation for Medical Imaging.</papertitle>
                      </a>
                      <br>
                      Wentao Liu, Chaofan Ma, Yuhuan Yang, <strong>Weidi Xie</strong>, Ya Zhang
                      <br>
                      In: <em> International Conference on Medical Image Computing and Computer Assisted Intervention (MICCAI)</em>, 2022.  &nbsp; <font color="red"><strong>(Early Accept)</strong></font> 
                      <br>
                      <a href="https://wtliu7.github.io/tis/">Project Page</a> |
                      <a href="https://arxiv.org/abs/2208.09592">Arxiv</a>
                      </li>
                    <p> </p>  

                  <li>
                    <a href="https://arxiv.org/abs/2204.02968">
                    <papertitle>Temporal Alignment Networks for Long-term Video.</papertitle>
                    </a>
                    <br>
                    Tengda Han, <strong>Weidi Xie</strong>, Andrew Zisserman
                    <br>
                    In: <em>Conference on Computer Vision and Pattern Recognition (CVPR) </em>, 2022. &nbsp; <font color="red"><strong>(Oral Presentation)</strong></font> 
                    <br> 
                    <a href="https://www.robots.ox.ac.uk/~vgg/research/tan/">Project Page</a> | <a href="https://arxiv.org/abs/2204.02968">Arxiv</a>
                    </li>
                    <p> </p>            
                
                <li>
                    <a href="https://arxiv.org/abs/2112.05749v1">
                    <papertitle>Label, Verify, Correct: A Simple Few Shot Object Detection Method.</papertitle>
                    </a>
                    <br>
                    Prannay Kaul, <strong>Weidi Xie</strong>, Andrew Zisserman
                    <br>
                    In: <em>Conference on Computer Vision and Pattern Recognition (CVPR) </em>, 2022.
                    <br>
                    <a href="https://www.robots.ox.ac.uk/~vgg/research/lvc/">Project Page</a> |
                    <a href="https://arxiv.org/abs/2112.05749v1">Arxiv</a>
                    </li>
                  <p> </p>        

               <li>
                    <a href="https://charigyang.github.io/abouttime/">
                    <papertitle>It's About Time: Analog Clock Reading in the Wild.</papertitle>
                    </a>
                    <br>
                    Charig Yang, <strong>Weidi Xie</strong>, Andrew Zisserman
                    <br>
                    In: <em>Conference on Computer Vision and Pattern Recognition (CVPR) </em>, 2022.
                    <br>
                    <a href="https://charigyang.github.io/abouttime/">Project Page</a> |
                    <a href="https://arxiv.org/abs/2111.09162">Arxiv</a>
                    </li>
                  <p> </p>  
                    
                <li>
                    <a href="https://arxiv.org/pdf/2203.12614.pdf">
                    <papertitle>Unsupervised Salient Object Detection with Spectral Cluster Voting.</papertitle>
                    </a>
                    <br>
                    Gyungin Shin, Samuel Albanie, <strong>Weidi Xie</strong>
                    <br>
                    In: <em>Conference on Computer Vision and Pattern Recognition,  L3D-IVU Workshop </em>, 2022.
                    <br>
                    <a href="https://github.com/NoelShin/selfmask">Code</a> |
                    <a href="https://arxiv.org/pdf/2203.12614.pdf">Arxiv</a>
                    </li>
                    <p> </p>
                    
                    <p> </p>
                    <li>
                    <a href="https://arxiv.org/abs/2103.14653">
                    <papertitle>Quantum Self-supervised Learning.</papertitle>
                    </a>
                    <br>
                        Ben Jaderberg, Lewis W. Anderson, <strong>Weidi Xie</strong>, Samuel Albanie, Martin Kiffner, Dieter Jaksch
                    <br>
                        In: <em>Quantum Science and Technology, 2022 (Impact Factor: ~5.2)</em>
                    <br>
                    <a href="https://github.com/bjader/quantum-neural-network">Code</a> |
                    <a href="https://arxiv.org/abs/2103.14653">Arxiv</a>
                    </li>
                    <p> </p>

                    <li>
                      <a href="https://www.sciencedirect.com/science/article/pii/S1053811922002452">
                      <papertitle>Subcortical Segmentation Of The Fetal Brain in 3D Ultrasound Using Deep Learning.</papertitle>
                      </a>
                      <br>
                      Linde S. Hesse, Moska Aliasi, Felipe Moser, the INTERGROWTH-21st Consortium, Monique C. Haak, <strong>Weidi Xie</strong>, Mark Jenkinson, Ana I.L. Namburete
                      <br>
                      In: <em>NeuroImage</em>, Volume 254, July, 2022. (Impact Factor: ~6.5) <br>
                      <a href="https://www.sciencedirect.com/science/article/pii/S1053811922002452">Link</a>
                      </li>
                      <p> </p>
                  
                </ol>
                <h2>2021</h2>
                <ol>
                  <li>
                    <a href="https://arxiv.org/abs/2109.12108">
                    <papertitle>ImplicitVol: Sensorless 3D Ultrasound Reconstruction with Deep Implicit Representation.</papertitle>
                    </a>
                    <br>
                      Pak-Hei Yeung, Linde Hesse, Moska Aliasi, Monique Haak, the INTERGROWTH-21st Consortium, <strong>Weidi Xie</strong>*, Ana I.L. Namburete*
                    <br>
                     <a href="https://pakheiyeung.github.io/ImplicitVol_wp/">Project Page</a> |
                    <a href="https://arxiv.org/abs/2109.12108">Arxiv</a>
                    </li>
                    <p> </p>
  
                  <li>
                    <a href="https://www.robots.ox.ac.uk/~vgg/publications/2021/Lamdouar21/lamdouar21.pdf">
                    <papertitle>Segmenting Invisible Moving Objects.</papertitle>
                    </a>
                    <br>
                      Hala Lamdouar,  <strong>Weidi Xie</strong>, Andrew Zisserman
                    <br>
                      In: <em> British Machine Vision Conference (BMVC)</em>, 2021.
                    <br>
                    <a href="https://www.robots.ox.ac.uk/~vgg/research/simo/">Project Page</a> |
                    <a href="paper/bmvc2021-motion%20segmentation.pdf">Paper</a>
                    </li>
                    <p> </p>

                  <li>
                    <a href="https://www.robots.ox.ac.uk/~vgg/publications/2021/Chen21b/chen21b.pdf">
                    <papertitle>Audio-Visual Synchronisation In the Wild.</papertitle>
                    </a>
                    <br>
                      Honglie Chen,  <strong>Weidi Xie</strong>, Triantafyllos Afouras, Arsha Nagrani, Andrea Vedaldi, Andrew Zisserman
                     <br>
                       In: <em> British Machine Vision Conference (BMVC)</em>, 2021.
                    <br>
                    <a href="https://www.robots.ox.ac.uk/~vgg/research/avs">Project Page</a> |
                    <a href="paper/bmvc2021-audio-visual%20sync.pdf">Paper</a>
                    </li>
                    <p> </p>

                  <li>
                    <a href="https://arxiv.org/abs/2104.06394">
                    <papertitle>All You Need Are a Few Pixels: Semantic Segmentation with PixelPick.</papertitle>
                    </a>
                    <br>
                      Gyungin Shin,  <strong>Weidi Xie</strong>, Samuel Albanie
                    <br>
                      In: <em> International Conference on Computer Vision (ICCV),
                      <a href="https://ildav-workshop.github.io">ILDAV Workshop</a></em>, 2021. &nbsp; <font color="red"><strong>(Best Paper Award)</strong></font>
                    <br>
                    <a href="https://www.robots.ox.ac.uk/~vgg/research/pixelpick/">Project Page</a> |
                    <a href="https://arxiv.org/abs/2104.06394">Arxiv</a>
                    </li>
                    <p> </p>

                  <li>
                    <a href="https://arxiv.org/abs/2102.07064">
                    <papertitle>NeRF--: Neural Radiance Fields Without Known Camera Parameters.</papertitle>
                    </a>
                    <br>
                      Zirui Wang, Shangzhe Wu,  <strong>Weidi Xie</strong>, Min Chen, Victor Adrian Prisacariu
                    <br>
                    <a href="https://nerfmm.active.vision">Project Page</a> |
                    <a href="https://arxiv.org/abs/2102.07064">Arxiv</a>
                    </li>
                    <p> </p>

                  <li>
                    <a href="https://arxiv.org/abs/2104.07658">
                    <papertitle>Self-supervised Video Object Segmentation by Motion Grouping.</papertitle>
                    </a>
                    <br>
                      Charig Yang, Hala Lamdouar, Erika Lu, Andrew Zisserman, <strong>Weidi Xie</strong>
                    <br>
                      In: <em> International Conference on Computer Vision (ICCV)</em>, 2021.
                    <br>
                      <a href="https://charigyang.github.io/motiongroup/">Project Page</a> |
                      <a href="https://arxiv.org/abs/2104.07658">Arxiv</a>
                    </li>
                    <p> </p>

                  <li>
                    <a href="https://arxiv.org/abs/2105.12722">
                    <papertitle>Sli2Vol: Annotate a 3D Volume from a Single Slice with Self-Supervised Learning.</papertitle>
                    </a>
                    <br>
                      Pak Hei Yeung, Ana I.L. Namburete, <strong>Weidi Xie</strong>
                    <br>
                      In: <em> International Conference on Medical Image Computing and Computer Assisted Intervention (MICCAI)</em>, 2021.
                    <br>
                      <a href="https://pakheiyeung.github.io/Sli2Vol_wp/">Project Page</a> |
                      <a href="https://arxiv.org/abs/2105.12722">Arxiv</a>
                    </li>
                    <p> </p>

                  <li>
                    <a href="https://arxiv.org/abs/2104.07658">
                    <papertitle>Self-supervised Video Object Segmentation by Motion Grouping (Short Version).</papertitle>
                    </a>
                    <br>
                      Charig Yang, Hala Lamdouar, Erika Lu, Andrew Zisserman, <strong>Weidi Xie</strong>
                    <br>
                      In: <em>Conference on Computer Vision and Pattern Recognition (CVPR),
                      <a href="https://eval.vision.rwth-aachen.de/rvsu-workshop21/">RVSU Workshop</a></em>, 2021.
                      &nbsp; <font color="red"><strong>(Best Paper Award)</strong></font>
                      <br>
                      <a href="https://charigyang.github.io/motiongroup/">Project Page</a> |
                      <a href="https://arxiv.org/abs/2104.07658">Arxiv</a>
                    </li>
                    <p> </p>

                  <li>
                    <a href="https://arxiv.org/abs/2104.02691">
                    <papertitle>Localizing Visual Sounds the Hard Way.</papertitle></a>
                    <br>
                    Honglie Chen, <strong>Weidi Xie</strong>, Triantafyllos Afouras, Arsha Nagrani, Andrea Vedaldi, Andrew Zisserman
                    <br>
                    In: <em>Conference on Computer Vision and Pattern Recognition (CVPR)</em>, 2021
                    <br>
                    <a href="https://www.robots.ox.ac.uk/~vgg/research/lvs/">Project Page</a> |
                    <a href="https://arxiv.org/abs/2104.02691">Arxiv</a> </li>
                    <p> </p>

                  <li>
                    <a href="https://www.sciencedirect.com/science/article/pii/S136184152100044X#!">
                    <papertitle>Learning to Map 2D Ultrasound Images into 3D Space with Minimal Human Annotation.</papertitle></a>
                    <br>
                    Pak-Hei Yeung, Moska Aliasi, Aris T. Papageorghiou, Monique Haak, <strong>Weidi Xie</strong>, Ana I.L. Namburete.
                    <br>
                    In: <em>Medical Image Analysis, 2021. (Impact Factor: ~11)</em>
                   <br>
                    <a href="https://pakheiyeung.github.io/PlaneInVol_wp/">Project Page</a> |
                    <a href="https://www.sciencedirect.com/science/article/pii/S136184152100044X#!">Paper</a> </li>
                    <p> </p>

                </ol>
                <h2>2020</h2>
                <ol>
                  <li>
                    <a href="https://arxiv.org/pdf/2012.06867.pdf">
                    <papertitle>VoxSRC 2020: The Second VoxCeleb Speaker Recognition Challenge.</papertitle>
                    </a>
                    <br>
                    Arsha Nagrani,
                    Joon Son Chung,
                    Jaesung Huh,
                    Andrew Brown,
                    Ernesto Coto,
                    <strong>Weidi Xie</strong>,
                    Mitchell McLaren,
                    Douglas A Reynolds,
                    Andrew Zisserman.<br>
                    <a href="https://arxiv.org/pdf/2012.06867.pdf">Tech Report</a></li>
                    <p> </p>

                  <li>
                  <a href="https://arxiv.org/abs/2010.09709">
                  <papertitle>Self-supervised Co-training for Video Representation Learning.</papertitle>
                  </a>
                  <br>
                  Tengda Han,
                  <strong>Weidi Xie</strong>, Andrew Zisserman <br>
                  In: <em>Conference on Neural Information Processing Systems (NeurIPS)</em>, 2020. <br>
                  <a href="https://arxiv.org/abs/2010.09709">Arxiv</a> |
                  <a href="http://www.robots.ox.ac.uk/~vgg/research/CoCLR/">Project Page</a> |
                  <a href="https://github.com/TengdaHan/CoCLR">Code & Model</a></li>
                  <p> </p>

                  <li>
                  <a href="https://arxiv.org/abs/2011.11630">
                  <papertitle>Betrayed by Motion: Camouflaged Object Discovery via Motion Segmentation.</papertitle>
                  </a>
                  <br>
                  Hala Lamdouar, Charig Yang, <strong>Weidi Xie</strong>, Andrew Zisserman<br>
                  In: <em>Asian Conference on Computer Vision (ACCV)</em>, 2020. <br>
                  <a href="https://arxiv.org/abs/2011.11630">Arxiv</a> |
                  <a href="http://www.robots.ox.ac.uk/~vgg/publications/2020/Lamdouar20/lamdouar20.pdf">PDF</a> |
                  <a href="http://www.robots.ox.ac.uk/~vgg/data/MoCA/">Project Page</a></li>
                  <p> </p>

                  <li>
                  <a href="https://arxiv.org/abs/2009.07833">
                  <papertitle>Layered Neural Rendering for Retiming People in Video.</papertitle>
                  </a>
                  <br>
                  Erika Lu, Forrester Cole, Tali Dekel, <strong>Weidi Xie</strong>, Andrew Zisserman, David Salesin, William T. Freeman, Michael Rubinstein<br>
                  In: <em>ACM Transactions on Graphics (TOG). Proc. SIGGRAPH Asia</em>, 2020<br>

                  <a href="https://arxiv.org/abs/2009.07833">Arxiv</a> |
                  <a href="https://retiming.github.io">Project Page</a></li>
                  <p> </p>

                  <li>
                  <a href="http://www.robots.ox.ac.uk/~vgg/publications/2020/Xie20/xie20.pdf">
                  <papertitle>Inducing Predictive Uncertainty Estimation for Face Recognition.</papertitle>
                  </a>
                  <br>
                  <strong>Weidi Xie</strong>, Jeffrey Byrne, Andrew Zisserman<br>
                  In: <em>British Machine Vision Conference (BMVC)</em>, 2020<br>
                  <a href="https://arxiv.org/abs/2009.00603">Arxiv</a> |
                  <a href="http://www.robots.ox.ac.uk/~vgg/publications/2020/Xie20/xie20.pdf">PDF</a></li>
                  <p> </p>

                  <li>
                  <a href="https://arxiv.org/abs/2007.12163">
                  <papertitle>Smooth-AP: Smoothing the Path Towards Large-Scale Image Retrieval.</papertitle>
                  </a>
                  <br>
                  Andrew Brown, <strong>Weidi Xie</strong>, Vicky Kalogeiton, Andrew Zisserman<br>
                  In: <em>European Conference on Computer Vision (ECCV)</em>, 2020<br>
                   <a href="https://arxiv.org/abs/2007.12163">Arxiv</a> |
                   <a href="https://www.robots.ox.ac.uk/~vgg/research/smooth-ap/">Project Page</a> |
                  <a href="https://github.com/Andrew-Brown1/Smooth_AP">Code & Model</a></li>
                  <p> </p>

                  <li>
                  <a href="https://arxiv.org/abs/2008.01065">
                  <papertitle>Memory-augmented Dense Predictive Coding for Video Representation Learning.</papertitle>
                  </a>
                  <br>
                  Tengda Han,
                  <strong>Weidi Xie</strong>, Andrew Zisserman <br>
                  In: <em>European Conference on Computer Vision (ECCV)</em>, 2020
                  &nbsp; <font color="red"><strong>(Spotlight Presentation)</strong></font> <br>
                  <a href="https://arxiv.org/abs/2008.01065">Arxiv</a> |
                  <a href="http://www.robots.ox.ac.uk/~vgg/research/DPC/">Project Page</a> |
                  <a href="https://tengdahan.github.io">Code & Model</a></li>
                  <p> </p>

                  <li>
                  <a href="https://arxiv.org/abs/2002.07793">
                  <papertitle>MAST: A Memory-Augmented Self-Supervised Tracker.</papertitle>
                  </a>
                  <br>
                  Zihang Lai,
                  Erika Lu,
                  <strong>Weidi Xie</strong> <br>
                  In: <em>Conference on Computer Vision and Pattern Recognition (CVPR)</em>, 2020<br>
                  <a href="https://arxiv.org/abs/2002.07793">Arxiv</a> |
                  <a href="https://zlai0.github.io/MAST/">Project Page</a> |
                  <a href="https://github.com/zlai0/MAST">Code & Model</a></li>
                  <p> </p>

                  <li>
                  <a href="https://arxiv.org/abs/2004.14368">
                  <papertitle>VGG-Sound: A Large-Scale Audio-Visual Dataset.</papertitle>
                  </a>
                  <br>
                  Honglie Chen,
                  <strong>Weidi Xie</strong>,
                  Andrea Vedaldi,
                  Andrew Zisserman <br>
                  In: <em>International Conference on Acoustics, Speech, and Signal Processing (ICASSP)</em>, 2020<br>
                  <a href="https://arxiv.org/abs/2004.14368">Arxiv</a> |
                  <a href="http://www.robots.ox.ac.uk/~vgg/publications/2020/Chen20/chen20.pdf">PDF</a> |
                  <a href="http://www.robots.ox.ac.uk/~vgg/data/vggsound/">Project Page</a> |
                  <a href="https://github.com/hche11/VGGSound">Code & Model</a> </li>
                  <p> </p>

                  <li>
                  <a href="https://ieeexplore.ieee.org/document/8999615">
                  <papertitle>Low-Memory CNNs Enabling Real-Time Ultrasound Segmentation Towards Mobile Deployment.</papertitle>
                  </a>
                  <br>
                  Sagar Vaze, <strong>Weidi Xie</strong>, Ana Namburete. <br>
                  In: <em>IEEE Journal of Biomedical and Health Informatics</em>, 2020. (Impact Factor: ~7)<br>
                  <a href="https://sgvaze.github.io/pages/lightweight_unets.html">Project Page</a>  |
                  <a href="https://github.com/sgvaze/lightweight_unet">Code</a></li>
                  <p> </p>

                  <li>
                  <a href="https://www.sciencedirect.com/science/article/pii/S0885230819302712">
                  <papertitle>VoxCeleb: Large-scale Speaker Verification in the Wild.</papertitle>
                  </a>
                  <br>
                  Arsha Nagrani*, Joon Son Chung*, <strong>Weidi Xie*</strong>,
                  Andrew Zisserman.  (* indicates equal contribution)<br>
                  In: <em>Computer Speech & Language</em>, 2020. (Impact Factor: ~1.8)<br>
                  <a href="https://www.sciencedirect.com/science/article/pii/S0885230819302712">Paper</a></li>
                  <p> </p>
                </ol>


                <h2>2019</h2>
                <ol>
                  <li>
                  <a href="https://arxiv.org/pdf/1912.02522.pdf">
                  <papertitle>VoxSRC 2019: The first VoxCeleb Speaker Recognition Challenge.</papertitle>
                  </a>
                  <br>
                  Joon Son Chung, Arsha Nagrani,
                  Ernesto Coto,
                  <strong>Weidi Xie</strong>,
                  Mitchell McLaren, Douglas A Reynolds,
                  Andrew Zisserman.<br>
                  <a href="https://arxiv.org/pdf/1912.02522.pdf">Tech Report</a></li>
                  <p> </p>

                  <li>
                  <a href="https://arxiv.org/abs/1909.04656">
                  <papertitle>Video Representation Learning by Dense Predictive Coding.</papertitle>
                  </a>
                  <br> Tengda Han,
                  <strong>Weidi Xie</strong>,
                  Andrew Zisserman<br>
                  In: <em>1st International Workshop on Large-scale Holistic Video Understanding, ICCV</em>, 2019.
                  &nbsp; <font color="red"><strong>(Oral Presentation)</strong></font> <br>
                  <a href="https://arxiv.org/abs/1909.04656">Arxiv</a> |
                  <a href="http://www.robots.ox.ac.uk/~vgg/research/DPC/">Project Page</a> |
                  <a href="https://github.com/TengdaHan/DPC">Code</a> </li>
                  <p> </p>

                  <li>
                  <a href="https://arxiv.org/abs/1905.00875">
                  <papertitle>Self-supervised Learning for Video Correspondence Flow.</papertitle>
                  </a>
                  <br> Zihang Lai,
                  <strong>Weidi Xie</strong> <br>
                  In: <em>British Machine Vision Conference (BMVC)</em>, 2019.
                  &nbsp; <font color="red"><strong>(Oral Presentation)</strong></font> <br>
                  <a href="https://arxiv.org/abs/1905.00875">Arxiv</a> |
                  <a href="https://zlai0.github.io/CorrFlow/">Project Page</a> </li>
                  <p> </p>

                  <li>
                  <a href="http://www.robots.ox.ac.uk/~vgg/publications/2019/Chen19/chen19.pdf">
                  <papertitle>AutoCorrect: Deep Inductive Alignment of Noisy Geometric Annotations.</papertitle>
                  </a>
                  <br> Honglie Chen,
                  <strong>Weidi Xie</strong>,
                  Andrea Vedaldi,
                  Andrew Zisserman. <br>
                  In: <em>British Machine Vision Conference (BMVC)</em>, 2019.
                  &nbsp; <font color="red"><strong>(Spotlight Presentation)</strong></font> <br>
                  <a href="https://arxiv.org/abs/1908.05263">Arxiv</a> |
                  <a href="http://www.robots.ox.ac.uk/~vgg/publications/2019/Chen19/chen19.pdf">PDF</a> </li>
                  <p> </p>

                  <li>
                  <a href="http://www.robots.ox.ac.uk/~vgg/publications/2019/xu19/xu19.pdf">
                  <papertitle>Geometry-Aware Corner Network for Video Object Detection from Static Cameras.</papertitle>
                  </a>
                  <br> Dan Xu,
                  <strong>Weidi Xie</strong>,
                  Andrew Zisserman. <br>
                  In: <em>British Machine Vision Conference (BMVC)</em>, 2019.
                  &nbsp; <font color="red"><strong>(Oral Presentation)</strong></font> <br>
                  <a href="https://arxiv.org/abs/1909.03140">Arxiv</a> |
                  <a href="http://www.robots.ox.ac.uk/~vgg/publications/2019/xu19/xu19.pdf">PDF</a> </li>
                  <p> </p>

                  <li>
                  <a href="https://arxiv.org/abs/1902.10107">
                  <papertitle>Utterance-level Aggregation for Speaker Recognition in the Wild.</papertitle>
                  </a>
                  <br>
                  <strong>Weidi Xie</strong>,
                  Arsha Nagrani, Joon Son Chung, Andrew Zisserman. <br>
                  In: <em>International Conference on Acoustics, Speech, and Signal Processing (ICASSP)</em>, 2019.
                  &nbsp; <font color="red"><strong>(Oral Presentation)</strong></font> <br>
                  <a href="https://arxiv.org/abs/1902.10107">Arxiv</a> |
                  <a href="http://www.robots.ox.ac.uk/~vgg/research/speakerID/">Project Page</a> |
                  <a href="https://github.com/WeidiXie/VGG-Speaker-Recognition">Code & Model</a></li>
                  <p> </p>

                </ol>
                <h2>2018</h2>
                <ol>
                  <li>
                  <a href="https://www.robots.ox.ac.uk/~vgg/publications/2018/Xie18a/xie18a.pdf">
                  <papertitle>Comparator Networks.</papertitle>
                  </a>
                  <br>
                  <strong>Weidi Xie</strong>, Li Shen, Andrew Zisserman
                  <br>
                  In: <em>European Conference on Computer Vision (ECCV)</em>, 2018.
                  <br>
                  <a href="https://arxiv.org/abs/1807.11440">Arxiv</a> |
                  <a href="https://www.robots.ox.ac.uk/~vgg/publications/2018/Xie18a/xie18a.pdf">PDF</a></li>
                  <p> </p>

                  <li>
                  <a href="https://www.robots.ox.ac.uk/~vgg/publications/2018/Xie18b/xie18b.pdf">
                  <papertitle>Multicolumn Networks on Face Recognition.</papertitle>
                  </a>
                  <br>
                  <strong>Weidi Xie</strong>, Andrew Zisserman
                  <br>
                  In: <em>British Machine Vision Conference (BMVC)</em>, 2018.
                  <br>
                  <a href="https://arxiv.org/abs/1807.09192">Arxiv</a> |
                  <a href="https://www.robots.ox.ac.uk/~vgg/publications/2018/Xie18b/xie18b.pdf">PDF</a> |
                  <a href="https://github.com/WeidiXie/multicoumn_network">Code & Model</a> |
                  <a href="data/XieBMVC2018.bib">Bibtex</a></li>
                  <p> </p>

                  <li>
                  <a href="http://www.robots.ox.ac.uk/~vgg/publications/2018/Lu18/lu18.pdf">
                  <papertitle>Class-Agnostic Counting.</papertitle>
                  </a>
                  <br>
                  Erika Lu, <strong>Weidi Xie</strong>, Andrew Zisserman
                  <br>
                  In: <em>Asian Conference on Computer Vision (ACCV)</em>, 2018.
                  <br>
                  <a href="https://arxiv.org/abs/1811.00472">Arxiv</a> |
                  <a href="http://www.robots.ox.ac.uk/~vgg/research/class-agnostic-counting/">Project Page</a> |
                  <a href="data/XieBMVC2018.bib">Bibtex</a></li>
                  <p> </p>

                  <li>
                  <a href="http://www.robots.ox.ac.uk/~vgg/publications/2018/Cao18/cao18.pdf">
                  <papertitle>VGGFace2: A Dataset for Recognising Faces Across Pose and Age.</papertitle>
                  </a>
                  <br>
                  Qiong Cao, Li Shen, <strong>Weidi Xie</strong>, Omkar M. Parkhi and Andrew Zisserman
                  <br>
                  In: <em>IEEE International Conference on Automatic Face and Gesture Recognition (F&G)</em>, 2018.
                  &nbsp; <font color="red"><strong>(Oral Presentation)</strong></font>
                  <br>
                  <a href="https://arxiv.org/abs/1710.08092">Arxiv</a> |
                  <a href="http://www.robots.ox.ac.uk/~vgg/publications/2018/Cao18/cao18.pdf">PDF</a> |
                  <a href="http://www.robots.ox.ac.uk/~vgg/data/vgg_face2/">Project Page</a> |
                  <a href="data/CaoFG2018.bib">Bibtex</a></li>
                  <p> </p>

                  <li>
                  <a href="https://www.sciencedirect.com/science/article/abs/pii/S1361841518302998">
                  <papertitle>Omega-Net: Fully Automatic, Multi-View Cardiac MR Detection, Orientation, and Segmentation with Deep Neural Networks.</papertitle>
                  </a>
                  <br>
                  <strong>Weidi Xie*, Davis M. Vigneault*</strong>, Carolyn Y. Ho, David A. Bluemke and J. Alison Noble (*joint first author)
                  <br>
                  In: <em>Medical Image Analysis, Volume 48, Pages 95, August 2018. (Impact Factor: ~11)</em>
                  <br>
                  <a href="https://arxiv.org/abs/1711.01094">Arxiv</a> |
                  <a href="https://www.sciencedirect.com/science/article/abs/pii/S1361841518302998">Paper</a></li>
                  <p> </p>

                  <li>
                  <a href="https://www.sciencedirect.com/science/article/abs/pii/S1361841518301920">
                  <papertitle>VP-Nets: Efficient Automatic Localization of Key Brain Structures in 3D Fetal Neurosonography.</papertitle>
                  </a>
                  <br>
                  Ruobing Huang, <strong>Weidi Xie</strong> and J. Alison Noble
                  <br>
                  In: <em>Medical Image Analysis, Volume 47, Pages 127, July 2018. (Impact Factor: ~11)</em>
                  <br>
                  <a href="https://www.sciencedirect.com/science/article/abs/pii/S1361841518301920">Paper</a></li>
                  <p> </p>

                  <li>
                  <a href="https://www.sciencedirect.com/science/article/abs/pii/S1361841518300306">
                  <papertitle>Fully-Automated Alignment of 3D Fetal Brain Ultrasound to a Canonical Reference Space Using Multi-task Learning.</papertitle>
                  </a>
                  <br>
                  <strong>Weidi Xie*, Ana I.L. Namburete*</strong>,   Mohammad Yaqub,
                  Andrew Zisserman and J. Alison Noble (*joint first author)
                  <br>
                  In: <em>Medical Image Analysis, Volume 46, Pages 1, May 2018. (Impact Factor: ~11)</em>
                  <br>
                  <a href="https://www.sciencedirect.com/science/article/abs/pii/S1361841518300306">Paper</a></li>
                  <p> </p>


                </ol>
                <h2>2017</h2>
                <ol>
                  <li>
                  <a href="https://link.springer.com/chapter/10.1007/978-3-319-59448-4_18">
                  <papertitle>Feature Tracking Cardiac Magnetic Resonance via Deep Learning and Spline Optimization.</papertitle>
                  </a>
                  <br>
                  Davis M. Vigneault, <strong>Weidi Xie</strong>, David A. Bluemke and J. Alison Noble
                  <br>
                  In: <em>Functional Imaging and Modelling of the Heart (FIMH)</em>, 2017.
                  &nbsp; <font color="red"><strong>(Best Poster Award)</strong></font>
                  <br>
                  <a href="https://arxiv.org/abs/1704.03660">Arxiv</a> |
                  <a href="https://link.springer.com/chapter/10.1007/978-3-319-59448-4_18">Paper</a></li>
                  <p> </p>

                  <li>
                  <a href="https://link.springer.com/chapter/10.1007%2F978-3-319-67561-9_8">
                  <papertitle>Robust Regression of Brain Maturation from 3D Fetal Neurosonography using CRNs.</papertitle>
                  </a>
                  <br>
                  Ana I.L. Namburete, <strong>Weidi Xie</strong> and J. Alison Noble
                  <br>
                  In: <em>MICCAI Workshop on Fetal and InFant Image analysis (FIFI)</em>, 2017.
                  &nbsp; <font color="red"><strong>(Best Paper Award)</strong></font>
                  <br>
                  <a href="https://www.dropbox.com/s/ypyita3gabr2cs4/3d_brain_age.pdf?dl=0">Paper</a></li>
                  <p> </p>


                </ol>
                <h2>2016</h2>
                <ol>
                  <li>
                  <a href="https://www.tandfonline.com/doi/full/10.1080/21681163.2016.1149104">
                  <papertitle>Microscopy Cell Counting and Detection with Fully Convolutional Regression Networks.</papertitle>
                  </a>
                  <br>
                  <strong>Weidi Xie</strong>, J. Alison Noble and Andrew Zisserman
                  <br>
                  In: <em>MICCAI 1st Deep Learning Workshop</em>, 2015.
                  <br>
                  In: <em>Computer Methods in Biomechanics and Biomedical Engineering: Imaging & Visualization</em>, 2016.
                  &nbsp; <font color="red"><strong>(Biannual Best Journal Article)</strong></font>
                  <br>
                  <a href="http://www.robots.ox.ac.uk/~vgg/publications/2016/Xie16/xie16.pdf">Paper</a> |
                  <a href="https://github.com/WeidiXie/cell_counting_v2">Code</a> |
                  <a href="https://think.taylorandfrancis.com/journal-prize-computer-methods-in-biomechanics-and-biomedical-engineering-imaging-visualization-best-paper-award/">Award</a></li>
                  <p> </p>
                </ol>
              </td>
            </tr>
            </table>

            
            <!--<table width="100%" align="center" border="0" cellspacing="0" cellpadding="20">
              <tr>
                <td width="25%">
                        <img src="images/SILVER.gif">
                      </td>
                      <td valign="middle" width="75%">
                        
                          <papertitle>SILVER - an Improvement upon Radial Golden Ratio Sampling</papertitle>
                        
                        <br>
                        <strong>S. Sophie Schauman</strong>, 
                        <a href="https://www.ndcn.ox.ac.uk/team/thomas-okell">Thomas W. Okell</a>,
                        <a href="https://www.ndcn.ox.ac.uk/team/mark-chiew">Mark Chiew</a> 
                        <br>
                        <em>Submitted to the ISMRM Workshop on Sampling and Image Reconstruction, Sedona, USA</em>, 2020
                        <br>
                  <a href="https://github.com/SophieSchau/SILVER">GitHub</a>, <a href="data/posters/ISMRM_WS_SEDONA2020.pdf">Poster</a>
                        <br>
                        <p></p>
                        <p>Radial sampling in MRI has many advantages over Cartesian trajectories, including less coherent aliasing when undersampled and robustness to motion. Uniform radial sampling provides the highest SNR efficiency but lacks flexibility in choosing temporal windows for reconstruction. Golden means sampling, on the other hand, provide near-optimal efficiency for arbitrary window sizes. The Golden means method is based on doing a set increment that fills k-space as uniformly as possible. A constant increment is beneficial as it makes the efficiency of the reconstruction shift-invariant, allowing for flexible reconstruction windows starting from any spoke.
          We aim to show that by relaxing the requirement to have close to uniform sampling for any window size, by instead optimizing for a restricted window size range of interest, higher efficiency can be achieved by using different increments. This is a reasonable limitation, as meaningful images are never reconstructed from just one or a handful of spokes and uniformity is not important once fully sampled. We call this the Set Increment with Limited Views Enhancing Ratio (SILVER) method.
          </p>
                      </td>
                    </tr>
          
          
                    <tr>
                      <td width="25%">
                        <img src="images/ISMRM2019.gif">
                      </td>
                      <td valign="middle" width="75%">
                        
                          <papertitle>Accelerated Vessel-Encoded Arterial Spin Labeling Angiography</papertitle>
                        
                        <br>
                        <strong>S. Sophie Schauman</strong>, 
                        <a href="https://www.ndcn.ox.ac.uk/team/mark-chiew">Mark Chiew</a>,
                        <a href="https://www.ndcn.ox.ac.uk/team/thomas-okell">Thomas W. Okell</a> 
                        <br>
                        <em>ISMRM Annual Meeting, Montreal, Canada</em>, 2019
                        <br>
                      <a href="https://cds.ismrm.org/protected/19MPresentations/abstracts/0744.html">abstract</a>  (login needed)
                        <br>
                        <p></p>
                        <p>Vessel-encoded ASL can produce vessel-selective cerebral angiograms, but to separate blood from multiple arteries more images are needed than for standard ASL angiography, which increases scan time. Angiograms are however well suited for under-sampling and compressed sensing reconstruction because of their high intrinsic sparsity. In this work we demonstrate in-vivo that vessel-selective angiograms allow for higher acceleration factors, yielding comparable image quality to conventional angiography with matched scan time using 2D and 3D time-resolved golden angle radial acquisitions. With this optimised acquisition and reconstruction method, scan time of the 3D case can be reduced from 8:35 hours to ~5 minutes.</p>
                      </td>
                    </tr>
          
          <tr>
                      <td width="25%">
                        <img src="images/KSPACESIMU.gif">
                      </td>
                      <td valign="middle" width="75%">
                        
                          <papertitle>K-Space Simulator for Public Engagement</papertitle>
                        
                        <br>
                        <strong>S. Sophie Schauman</strong>, 
                        <a href="https://www.ndcn.ox.ac.uk/team/benjamin-tendler">Benjamin Tendler</a>,
                        <a href="https://www.ndcn.ox.ac.uk/team/stuart-clare">Stuart Clare</a> 
                        <br>
                        <em>Wellcome Centre for Integrative Neuroimaging (The Big Brain Roadshow)</em>, 2019
                        <br>
                  <a href="">Medium</a>, <a href="https://github.com/SophieSchau/TheImagedBrain">GitHub</a>
                        <br>
                        <p></p>
                        <p><i>The Big Brain Roadshow</i> is part of the Public Engagement work done at WIN. As Public Engagement Ambassador, one of my projects was to create a stall for 13-16-year-old children to learn about some aspect of the work physicists do in neuroscience labs. We came up with a way of showing the children that any image can be built up of waves (Fourier basis). The tool we built is a Matlab script that can process images shown to a webcam in real-time and produce the 2D Fourier transform of that image. The simulator also works in reverse. If you show the camera an image of k-space, with the real component in one colour channel and the imaginary component in another, it shows the image that that k-space represents without changing any settings.</p>
                      </td>
                    </tr>
          </table>-->
          
                    <!-- Page footer: credit for the site template. -->
                    <table align="center" width="100%" border="0" cellpadding="20" cellspacing="0">
                      <tr>
                        <td>
                          <br>
                          <p align="center">
                            <font size="2">Based on a template by <a href="https://jonbarron.info">Jon Barron</a></font>
                          </p>
                        </td>
                      </tr>
                    </table>
                  </td>
              </tr>
            </table>
          </body>
          
          </html>