@inproceedings{ouyang-etal-2024-climedbench,
    title = "{C}li{M}ed{B}ench: A Large-Scale {C}hinese Benchmark for Evaluating Medical Large Language Models in Clinical Scenarios",
    author = "Ouyang, Zetian and
      Qiu, Yishuai and
      Wang, Linlin and
      de Melo, Gerard and
      Zhang, Ya and
      Wang, Yanfeng and
      He, Liang",
    editor = "Al-Onaizan, Yaser and
      Bansal, Mohit and
      Chen, Yun-Nung",
    booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
    month = nov,
    year = "2024",
    address = "Miami, Florida, USA",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2024.emnlp-main.480/",
    doi = "10.18653/v1/2024.emnlp-main.480",
    pages = "8428--8438",
    abstract = "With the proliferation of Large Language Models (LLMs) in diverse domains, there is a particular need for unified evaluation standards in clinical medical scenarios, where models need to be examined very thoroughly. We present CliMedBench, a comprehensive benchmark with 14 expert-guided core clinical scenarios specifically designed to assess the medical ability of LLMs across 7 pivot dimensions. It comprises 33,735 questions derived from real-world medical reports of top-tier tertiary hospitals and authentic examination exercises. The reliability of this benchmark has been confirmed in several ways. Subsequent experiments with existing LLMs have led to the following findings: (i) Chinese medical LLMs underperform on this benchmark, especially where medical reasoning and factual consistency are vital, underscoring the need for advances in clinical knowledge and diagnostic accuracy. (ii) Several general-domain LLMs demonstrate substantial potential in medical clinics, while the limited input capacity of many medical LLMs hinders their practical use. These findings reveal both the strengths and limitations of LLMs in clinical scenarios and offer critical insights for medical research."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ouyang-etal-2024-climedbench">
    <titleInfo>
        <title>CliMedBench: A Large-Scale Chinese Benchmark for Evaluating Medical Large Language Models in Clinical Scenarios</title>
    </titleInfo>
    <name type="personal">
        <namePart type="given">Zetian</namePart>
        <namePart type="family">Ouyang</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Yishuai</namePart>
        <namePart type="family">Qiu</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Linlin</namePart>
        <namePart type="family">Wang</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Gerard</namePart>
<namePart type="family">De Melo</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Ya</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Yanfeng</namePart>
        <namePart type="family">Wang</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Liang</namePart>
        <namePart type="family">He</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <originInfo>
        <dateIssued>2024-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
        <titleInfo>
            <title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing</title>
        </titleInfo>
        <name type="personal">
            <namePart type="given">Yaser</namePart>
            <namePart type="family">Al-Onaizan</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Mohit</namePart>
            <namePart type="family">Bansal</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Yun-Nung</namePart>
            <namePart type="family">Chen</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <originInfo>
            <publisher>Association for Computational Linguistics</publisher>
            <place>
                <placeTerm type="text">Miami, Florida, USA</placeTerm>
            </place>
        </originInfo>
        <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>With the proliferation of Large Language Models (LLMs) in diverse domains, there is a particular need for unified evaluation standards in clinical medical scenarios, where models need to be examined very thoroughly. We present CliMedBench, a comprehensive benchmark with 14 expert-guided core clinical scenarios specifically designed to assess the medical ability of LLMs across 7 pivot dimensions. It comprises 33,735 questions derived from real-world medical reports of top-tier tertiary hospitals and authentic examination exercises. The reliability of this benchmark has been confirmed in several ways. Subsequent experiments with existing LLMs have led to the following findings: (i) Chinese medical LLMs underperform on this benchmark, especially where medical reasoning and factual consistency are vital, underscoring the need for advances in clinical knowledge and diagnostic accuracy. (ii) Several general-domain LLMs demonstrate substantial potential in medical clinics, while the limited input capacity of many medical LLMs hinders their practical use. These findings reveal both the strengths and limitations of LLMs in clinical scenarios and offer critical insights for medical research.</abstract>
    <identifier type="citekey">ouyang-etal-2024-climedbench</identifier>
    <identifier type="doi">10.18653/v1/2024.emnlp-main.480</identifier>
    <location>
        <url>https://aclanthology.org/2024.emnlp-main.480/</url>
    </location>
    <part>
        <date>2024-11</date>
        <extent unit="page">
            <start>8428</start>
            <end>8438</end>
        </extent>
    </part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T CliMedBench: A Large-Scale Chinese Benchmark for Evaluating Medical Large Language Models in Clinical Scenarios
%A Ouyang, Zetian
%A Qiu, Yishuai
%A Wang, Linlin
%A de Melo, Gerard
%A Zhang, Ya
%A Wang, Yanfeng
%A He, Liang
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F ouyang-etal-2024-climedbench
%X With the proliferation of Large Language Models (LLMs) in diverse domains, there is a particular need for unified evaluation standards in clinical medical scenarios, where models need to be examined very thoroughly. We present CliMedBench, a comprehensive benchmark with 14 expert-guided core clinical scenarios specifically designed to assess the medical ability of LLMs across 7 pivot dimensions. It comprises 33,735 questions derived from real-world medical reports of top-tier tertiary hospitals and authentic examination exercises. The reliability of this benchmark has been confirmed in several ways. Subsequent experiments with existing LLMs have led to the following findings: (i) Chinese medical LLMs underperform on this benchmark, especially where medical reasoning and factual consistency are vital, underscoring the need for advances in clinical knowledge and diagnostic accuracy. (ii) Several general-domain LLMs demonstrate substantial potential in medical clinics, while the limited input capacity of many medical LLMs hinders their practical use. These findings reveal both the strengths and limitations of LLMs in clinical scenarios and offer critical insights for medical research.
%R 10.18653/v1/2024.emnlp-main.480
%U https://aclanthology.org/2024.emnlp-main.480/
%U https://doi.org/10.18653/v1/2024.emnlp-main.480
%P 8428-8438
Markdown (Informal)
[CliMedBench: A Large-Scale Chinese Benchmark for Evaluating Medical Large Language Models in Clinical Scenarios](https://aclanthology.org/2024.emnlp-main.480/) (Ouyang et al., EMNLP 2024)
ACL
Zetian Ouyang, Yishuai Qiu, Linlin Wang, Gerard de Melo, Ya Zhang, Yanfeng Wang, and Liang He. 2024. CliMedBench: A Large-Scale Chinese Benchmark for Evaluating Medical Large Language Models in Clinical Scenarios. In Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing, pages 8428–8438, Miami, Florida, USA. Association for Computational Linguistics.