@inproceedings{lamproudis-etal-2021-developing,
title = "Developing a Clinical Language Model for {S}wedish: Continued Pretraining of Generic {BERT} with In-Domain Data",
author = "Lamproudis, Anastasios and
Henriksson, Aron and
Dalianis, Hercules",
editor = "Mitkov, Ruslan and
Angelova, Galia",
booktitle = "Proceedings of the International Conference on Recent Advances in Natural Language Processing (RANLP 2021)",
month = sep,
year = "2021",
address = "Held Online",
publisher = "INCOMA Ltd.",
url = "https://aclanthology.org/2021.ranlp-1.90",
pages = "790--797",
abstract = "The use of pretrained language models, fine-tuned to perform a specific downstream task, has become widespread in NLP. Using a generic language model in specialized domains may, however, be sub-optimal due to differences in language use and vocabulary. In this paper, it is investigated whether an existing, generic language model for Swedish can be improved for the clinical domain through continued pretraining with clinical text. The generic and domain-specific language models are fine-tuned and evaluated on three representative clinical NLP tasks: (i) identifying protected health information, (ii) assigning ICD-10 diagnosis codes to discharge summaries, and (iii) sentence-level uncertainty prediction. The results show that continued pretraining on in-domain data leads to improved performance on all three downstream tasks, indicating that there is a potential added value of domain-specific language models for clinical NLP.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="lamproudis-etal-2021-developing">
    <titleInfo>
      <title>Developing a Clinical Language Model for Swedish: Continued Pretraining of Generic BERT with In-Domain Data</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Anastasios</namePart>
      <namePart type="family">Lamproudis</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Aron</namePart>
      <namePart type="family">Henriksson</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Hercules</namePart>
      <namePart type="family">Dalianis</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2021-09</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the International Conference on Recent Advances in Natural Language Processing (RANLP 2021)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Ruslan</namePart>
        <namePart type="family">Mitkov</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Galia</namePart>
        <namePart type="family">Angelova</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>INCOMA Ltd.</publisher>
        <place>
          <placeTerm type="text">Held Online</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>The use of pretrained language models, fine-tuned to perform a specific downstream task, has become widespread in NLP. Using a generic language model in specialized domains may, however, be sub-optimal due to differences in language use and vocabulary. In this paper, it is investigated whether an existing, generic language model for Swedish can be improved for the clinical domain through continued pretraining with clinical text. The generic and domain-specific language models are fine-tuned and evaluated on three representative clinical NLP tasks: (i) identifying protected health information, (ii) assigning ICD-10 diagnosis codes to discharge summaries, and (iii) sentence-level uncertainty prediction. The results show that continued pretraining on in-domain data leads to improved performance on all three downstream tasks, indicating that there is a potential added value of domain-specific language models for clinical NLP.</abstract>
    <identifier type="citekey">lamproudis-etal-2021-developing</identifier>
    <location>
      <url>https://aclanthology.org/2021.ranlp-1.90</url>
    </location>
    <part>
      <date>2021-09</date>
      <extent unit="page">
        <start>790</start>
        <end>797</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Developing a Clinical Language Model for Swedish: Continued Pretraining of Generic BERT with In-Domain Data
%A Lamproudis, Anastasios
%A Henriksson, Aron
%A Dalianis, Hercules
%Y Mitkov, Ruslan
%Y Angelova, Galia
%S Proceedings of the International Conference on Recent Advances in Natural Language Processing (RANLP 2021)
%D 2021
%8 September
%I INCOMA Ltd.
%C Held Online
%F lamproudis-etal-2021-developing
%X The use of pretrained language models, fine-tuned to perform a specific downstream task, has become widespread in NLP. Using a generic language model in specialized domains may, however, be sub-optimal due to differences in language use and vocabulary. In this paper, it is investigated whether an existing, generic language model for Swedish can be improved for the clinical domain through continued pretraining with clinical text. The generic and domain-specific language models are fine-tuned and evaluated on three representative clinical NLP tasks: (i) identifying protected health information, (ii) assigning ICD-10 diagnosis codes to discharge summaries, and (iii) sentence-level uncertainty prediction. The results show that continued pretraining on in-domain data leads to improved performance on all three downstream tasks, indicating that there is a potential added value of domain-specific language models for clinical NLP.
%U https://aclanthology.org/2021.ranlp-1.90
%P 790-797
Markdown (Informal)
[Developing a Clinical Language Model for Swedish: Continued Pretraining of Generic BERT with In-Domain Data](https://aclanthology.org/2021.ranlp-1.90) (Lamproudis et al., RANLP 2021)
ACL
Anastasios Lamproudis, Aron Henriksson, and Hercules Dalianis. 2021. Developing a Clinical Language Model for Swedish: Continued Pretraining of Generic BERT with In-Domain Data. In Proceedings of the International Conference on Recent Advances in Natural Language Processing (RANLP 2021), pages 790–797, Held Online. INCOMA Ltd.
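The abstract describes continued (domain-adaptive) pretraining of a generic Swedish BERT on clinical text, followed by task-specific fine-tuning. As a rough illustration only, the sketch below shows how such continued masked-language-model pretraining might be set up with the Hugging Face Transformers library; the base checkpoint name, data file, and hyperparameters are assumptions for illustration, not the authors' actual setup.

```python
# Minimal sketch of continued pretraining of a generic Swedish BERT on
# clinical text via masked language modelling (Hugging Face Transformers).
# The checkpoint name, data file, and hyperparameters are illustrative
# assumptions, not the setup used in the paper.
from datasets import load_dataset
from transformers import (
    AutoModelForMaskedLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

base_model = "KB/bert-base-swedish-cased"  # assumed generic Swedish BERT
tokenizer = AutoTokenizer.from_pretrained(base_model)
model = AutoModelForMaskedLM.from_pretrained(base_model)

# Clinical notes, one document per line (hypothetical file).
raw = load_dataset("text", data_files={"train": "clinical_notes.txt"})

def tokenize(batch):
    return tokenizer(batch["text"], truncation=True, max_length=512)

tokenized = raw.map(tokenize, batched=True, remove_columns=["text"])

# Standard 15% token masking, as in the original BERT pretraining objective.
collator = DataCollatorForLanguageModeling(tokenizer, mlm_probability=0.15)

args = TrainingArguments(
    output_dir="swedish-clinical-bert",
    per_device_train_batch_size=16,
    num_train_epochs=1,          # illustrative; more passes in practice
    learning_rate=5e-5,
    save_strategy="epoch",
)

Trainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    data_collator=collator,
).train()

model.save_pretrained("swedish-clinical-bert")
tokenizer.save_pretrained("swedish-clinical-bert")
```

The resulting domain-adapted checkpoint would then be fine-tuned separately on each downstream task evaluated in the paper: identifying protected health information, assigning ICD-10 diagnosis codes to discharge summaries, and sentence-level uncertainty prediction.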