@inproceedings{bhattacharjee-etal-2024-indus,
title = "{INDUS}: Effective and Efficient Language Models for Scientific Applications",
author = "Bhattacharjee, Bishwaranjan and
Trivedi, Aashka and
Muraoka, Masayasu and
Ramasubramanian, Muthukumaran and
Udagawa, Takuma and
Gurung, Iksha and
Pantha, Nishan and
Zhang, Rong and
Dandala, Bharath and
Ramachandran, Rahul and
Maskey, Manil and
Bugbee, Kaylin and
Little, Michael M. and
Fancher, Elizabeth and
Gerasimov, Irina and
Mehrabian, Armin and
Sanders, Lauren and
Costes, Sylvain V. and
Blanco-Cuaresma, Sergi and
Lockhart, Kelly and
Allen, Thomas and
Grezes, Felix and
Ansdell, Megan and
Accomazzi, Alberto and
El-Kurdi, Yousef and
Wertheimer, Davis and
Pfitzmann, Birgit and
Berrospi Ramis, Cesar and
Dolfi, Michele and
De Lima, Rafael Teixeira and
Vagenas, Panagiotis and
Mukkavilli, S. Karthik and
Staar, Peter W. J. and
Vahidinia, Sanaz and
McGranaghan, Ryan and
Lee, Tsengdar J.",
editor = "Dernoncourt, Franck and
Preo{\c{t}}iuc-Pietro, Daniel and
Shimorina, Anastasia",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track",
month = nov,
year = "2024",
address = "Miami, Florida, US",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-industry.9",
doi = "10.18653/v1/2024.emnlp-industry.9",
pages = "98--112",
abstract = "Large language models (LLMs) trained on general domain corpora showed remarkable results on natural language processing (NLP) tasks. However, previous research demonstrated LLMs trained using domain-focused corpora perform better on specialized tasks. Inspired by this insight, we developed INDUS, a comprehensive suite of LLMs tailored for the closely-related domains of Earth science, biology, physics, heliophysics, planetary sciences and astrophysics, and trained using curated scientific corpora drawn from diverse data sources. The suite of models include: (1) an encoder model trained using domain-specific vocabulary and corpora to address NLP tasks, (2) a contrastive-learning based text embedding model trained using a diverse set of datasets to address information retrieval tasks and (3) smaller versions of these models created using knowledge distillation for applications which have latency or resource constraints. We also created three new scientific benchmark datasets, Climate-Change NER (entity-recognition), NASA-QA (extractive QA) and NASA-IR (IR) to accelerate research in these multi-disciplinary fields. We show that our models outperform both general-purpose (RoBERTa) and domain- specific (SciBERT) encoders on these new tasks as well as existing tasks in the domains of interest. Furthermore, we demonstrate the use of these models in two industrial settings- as a retrieval model for large-scale vector search applications and in automatic content tagging systems.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="bhattacharjee-etal-2024-indus">
<titleInfo>
<title>INDUS: Effective and Efficient Language Models for Scientific Applications</title>
</titleInfo>
<name type="personal">
<namePart type="given">Bishwaranjan</namePart>
<namePart type="family">Bhattacharjee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aashka</namePart>
<namePart type="family">Trivedi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Masayasu</namePart>
<namePart type="family">Muraoka</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Muthukumaran</namePart>
<namePart type="family">Ramasubramanian</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Takuma</namePart>
<namePart type="family">Udagawa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Iksha</namePart>
<namePart type="family">Gurung</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nishan</namePart>
<namePart type="family">Pantha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rong</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bharath</namePart>
<namePart type="family">Dandala</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rahul</namePart>
<namePart type="family">Ramachandran</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Manil</namePart>
<namePart type="family">Maskey</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kaylin</namePart>
<namePart type="family">Bugbee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michael</namePart>
<namePart type="given">M</namePart>
<namePart type="family">Little</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Elizabeth</namePart>
<namePart type="family">Fancher</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Irina</namePart>
<namePart type="family">Gerasimov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Armin</namePart>
<namePart type="family">Mehrabian</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lauren</namePart>
<namePart type="family">Sanders</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sylvain</namePart>
<namePart type="given">V</namePart>
<namePart type="family">Costes</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sergi</namePart>
<namePart type="family">Blanco-Cuaresma</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kelly</namePart>
<namePart type="family">Lockhart</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thomas</namePart>
<namePart type="family">Allen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Felix</namePart>
<namePart type="family">Grezes</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Megan</namePart>
<namePart type="family">Ansdell</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alberto</namePart>
<namePart type="family">Accomazzi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yousef</namePart>
<namePart type="family">El-Kurdi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Davis</namePart>
<namePart type="family">Wertheimer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Birgit</namePart>
<namePart type="family">Pfitzmann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Cesar</namePart>
<namePart type="family">Berrospi Ramis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michele</namePart>
<namePart type="family">Dolfi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rafael</namePart>
<namePart type="given">Teixeira</namePart>
<namePart type="family">De Lima</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Panagiotis</namePart>
<namePart type="family">Vagenas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">S</namePart>
<namePart type="given">Karthik</namePart>
<namePart type="family">Mukkavilli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Peter</namePart>
<namePart type="given">W</namePart>
<namePart type="given">J</namePart>
<namePart type="family">Staar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sanaz</namePart>
<namePart type="family">Vahidinia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ryan</namePart>
<namePart type="family">McGranaghan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tsengdar</namePart>
<namePart type="given">J</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track</title>
</titleInfo>
<name type="personal">
<namePart type="given">Franck</namePart>
<namePart type="family">Dernoncourt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Preoţiuc-Pietro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anastasia</namePart>
<namePart type="family">Shimorina</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, US</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Large language models (LLMs) trained on general domain corpora showed remarkable results on natural language processing (NLP) tasks. However, previous research demonstrated LLMs trained using domain-focused corpora perform better on specialized tasks. Inspired by this insight, we developed INDUS, a comprehensive suite of LLMs tailored for the closely-related domains of Earth science, biology, physics, heliophysics, planetary sciences and astrophysics, and trained using curated scientific corpora drawn from diverse data sources. The suite of models include: (1) an encoder model trained using domain-specific vocabulary and corpora to address NLP tasks, (2) a contrastive-learning based text embedding model trained using a diverse set of datasets to address information retrieval tasks and (3) smaller versions of these models created using knowledge distillation for applications which have latency or resource constraints. We also created three new scientific benchmark datasets, Climate-Change NER (entity-recognition), NASA-QA (extractive QA) and NASA-IR (IR) to accelerate research in these multi-disciplinary fields. We show that our models outperform both general-purpose (RoBERTa) and domain- specific (SciBERT) encoders on these new tasks as well as existing tasks in the domains of interest. Furthermore, we demonstrate the use of these models in two industrial settings- as a retrieval model for large-scale vector search applications and in automatic content tagging systems.</abstract>
<identifier type="citekey">bhattacharjee-etal-2024-indus</identifier>
<identifier type="doi">10.18653/v1/2024.emnlp-industry.9</identifier>
<location>
<url>https://aclanthology.org/2024.emnlp-industry.9</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>98</start>
<end>112</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T INDUS: Effective and Efficient Language Models for Scientific Applications
%A Bhattacharjee, Bishwaranjan
%A Trivedi, Aashka
%A Muraoka, Masayasu
%A Ramasubramanian, Muthukumaran
%A Udagawa, Takuma
%A Gurung, Iksha
%A Pantha, Nishan
%A Zhang, Rong
%A Dandala, Bharath
%A Ramachandran, Rahul
%A Maskey, Manil
%A Bugbee, Kaylin
%A Little, Michael M.
%A Fancher, Elizabeth
%A Gerasimov, Irina
%A Mehrabian, Armin
%A Sanders, Lauren
%A Costes, Sylvain V.
%A Blanco-Cuaresma, Sergi
%A Lockhart, Kelly
%A Allen, Thomas
%A Grezes, Felix
%A Ansdell, Megan
%A Accomazzi, Alberto
%A El-Kurdi, Yousef
%A Wertheimer, Davis
%A Pfitzmann, Birgit
%A Berrospi Ramis, Cesar
%A Dolfi, Michele
%A De Lima, Rafael Teixeira
%A Vagenas, Panagiotis
%A Mukkavilli, S. Karthik
%A Staar, Peter W. J.
%A Vahidinia, Sanaz
%A McGranaghan, Ryan
%A Lee, Tsengdar J.
%Y Dernoncourt, Franck
%Y Preoţiuc-Pietro, Daniel
%Y Shimorina, Anastasia
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, US
%F bhattacharjee-etal-2024-indus
%X Large language models (LLMs) trained on general domain corpora showed remarkable results on natural language processing (NLP) tasks. However, previous research demonstrated LLMs trained using domain-focused corpora perform better on specialized tasks. Inspired by this insight, we developed INDUS, a comprehensive suite of LLMs tailored for the closely-related domains of Earth science, biology, physics, heliophysics, planetary sciences and astrophysics, and trained using curated scientific corpora drawn from diverse data sources. The suite of models include: (1) an encoder model trained using domain-specific vocabulary and corpora to address NLP tasks, (2) a contrastive-learning based text embedding model trained using a diverse set of datasets to address information retrieval tasks and (3) smaller versions of these models created using knowledge distillation for applications which have latency or resource constraints. We also created three new scientific benchmark datasets, Climate-Change NER (entity-recognition), NASA-QA (extractive QA) and NASA-IR (IR) to accelerate research in these multi-disciplinary fields. We show that our models outperform both general-purpose (RoBERTa) and domain- specific (SciBERT) encoders on these new tasks as well as existing tasks in the domains of interest. Furthermore, we demonstrate the use of these models in two industrial settings- as a retrieval model for large-scale vector search applications and in automatic content tagging systems.
%R 10.18653/v1/2024.emnlp-industry.9
%U https://aclanthology.org/2024.emnlp-industry.9
%U https://doi.org/10.18653/v1/2024.emnlp-industry.9
%P 98-112
Markdown (Informal)
[INDUS: Effective and Efficient Language Models for Scientific Applications](https://aclanthology.org/2024.emnlp-industry.9) (Bhattacharjee et al., EMNLP 2024)
ACL
- Bishwaranjan Bhattacharjee, Aashka Trivedi, Masayasu Muraoka, Muthukumaran Ramasubramanian, Takuma Udagawa, Iksha Gurung, Nishan Pantha, Rong Zhang, Bharath Dandala, Rahul Ramachandran, Manil Maskey, Kaylin Bugbee, Michael M. Little, Elizabeth Fancher, Irina Gerasimov, Armin Mehrabian, Lauren Sanders, Sylvain V. Costes, Sergi Blanco-Cuaresma, et al.. 2024. INDUS: Effective and Efficient Language Models for Scientific Applications. In Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track, pages 98–112, Miami, Florida, US. Association for Computational Linguistics.