@inproceedings{kim-wu-2024-knowlabs,
title = "Knowlab`s Submission to {L}+{M} Shared Task: All you need is continued pretraining of chemistry texts even for molecule captioning",
author = "Kim, Yunsoo and
Wu, Honghan",
editor = "Edwards, Carl and
Wang, Qingyun and
Li, Manling and
Zhao, Lawrence and
Hope, Tom and
Ji, Heng",
booktitle = "Proceedings of the 1st Workshop on Language + Molecules (L+M 2024)",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.langmol-1.11/",
doi = "10.18653/v1/2024.langmol-1.11",
pages = "91--96",
abstract = "This paper presents our submission to the L+M-24 shared task, focused on translating molecular structures into natural language descriptions, known as the molecule captioning task. We selected a small language model (SLM), Phi-3-mini-4k, to evaluate the impact of continued pretraining and instruction tuning for domain-specific chemical knowledge. The Phi-3 model was continued pretrained with 90M chemistry textbooks and abstracts, followed by instruction tuning on 150K question answering sets of SMILES and general chemistry knowledge. Despite the continued pretraining phase not including direct exposure to SMILES representations, it significantly enhanced the Phi-3 model`s performance, a 300{\%} increase for the BLEU scores, in the molecule captioning task. The code and model are released at \url{https://github.com/bluesky333/Phi3KnowChem} to facilitate research in chemical small language modeling."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kim-wu-2024-knowlabs">
<titleInfo>
<title>Knowlab's Submission to L+M Shared Task: All you need is continued pretraining of chemistry texts even for molecule captioning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yunsoo</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Honghan</namePart>
<namePart type="family">Wu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 1st Workshop on Language + Molecules (L+M 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Carl</namePart>
<namePart type="family">Edwards</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Qingyun</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Manling</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lawrence</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tom</namePart>
<namePart type="family">Hope</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Heng</namePart>
<namePart type="family">Ji</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bangkok, Thailand</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper presents our submission to the L+M-24 shared task, which focuses on translating molecular structures into natural language descriptions, known as the molecule captioning task. We selected a small language model (SLM), Phi-3-mini-4k, to evaluate the impact of continued pretraining and instruction tuning for domain-specific chemical knowledge. The Phi-3 model underwent continued pretraining on 90M chemistry textbooks and abstracts, followed by instruction tuning on 150K question-answering sets of SMILES and general chemistry knowledge. Although the continued pretraining phase included no direct exposure to SMILES representations, it significantly enhanced the Phi-3 model's performance on the molecule captioning task, yielding a 300% increase in BLEU scores. The code and model are released at https://github.com/bluesky333/Phi3KnowChem to facilitate research in chemical small language modeling.</abstract>
<identifier type="citekey">kim-wu-2024-knowlabs</identifier>
<identifier type="doi">10.18653/v1/2024.langmol-1.11</identifier>
<location>
<url>https://aclanthology.org/2024.langmol-1.11/</url>
</location>
<part>
<date>2024-08</date>
<extent unit="page">
<start>91</start>
<end>96</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Knowlab's Submission to L+M Shared Task: All you need is continued pretraining of chemistry texts even for molecule captioning
%A Kim, Yunsoo
%A Wu, Honghan
%Y Edwards, Carl
%Y Wang, Qingyun
%Y Li, Manling
%Y Zhao, Lawrence
%Y Hope, Tom
%Y Ji, Heng
%S Proceedings of the 1st Workshop on Language + Molecules (L+M 2024)
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F kim-wu-2024-knowlabs
%X This paper presents our submission to the L+M-24 shared task, which focuses on translating molecular structures into natural language descriptions, known as the molecule captioning task. We selected a small language model (SLM), Phi-3-mini-4k, to evaluate the impact of continued pretraining and instruction tuning for domain-specific chemical knowledge. The Phi-3 model underwent continued pretraining on 90M chemistry textbooks and abstracts, followed by instruction tuning on 150K question-answering sets of SMILES and general chemistry knowledge. Although the continued pretraining phase included no direct exposure to SMILES representations, it significantly enhanced the Phi-3 model's performance on the molecule captioning task, yielding a 300% increase in BLEU scores. The code and model are released at https://github.com/bluesky333/Phi3KnowChem to facilitate research in chemical small language modeling.
%R 10.18653/v1/2024.langmol-1.11
%U https://aclanthology.org/2024.langmol-1.11/
%U https://doi.org/10.18653/v1/2024.langmol-1.11
%P 91-96
Markdown (Informal)
[Knowlab’s Submission to L+M Shared Task: All you need is continued pretraining of chemistry texts even for molecule captioning](https://aclanthology.org/2024.langmol-1.11/) (Kim & Wu, LangMol 2024)
ACL