@inproceedings{mandikal-2024-ancient,
  title     = {Ancient Wisdom, Modern Tools: Exploring Retrieval-Augmented {LLMs} for {Ancient Indian} Philosophy},
  author    = {Mandikal, Priyanka},
  editor    = {Pavlopoulos, John and
               Sommerschield, Thea and
               Assael, Yannis and
               Gordin, Shai and
               Cho, Kyunghyun and
               Passarotti, Marco and
               Sprugnoli, Rachele and
               Liu, Yudong and
               Li, Bin and
               Anderson, Adam},
  booktitle = {Proceedings of the 1st Workshop on Machine Learning for Ancient Languages ({ML4AL} 2024)},
  month     = aug,
  year      = {2024},
  address   = {Hybrid in Bangkok, Thailand and online},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.ml4al-1.23/},
  doi       = {10.18653/v1/2024.ml4al-1.23},
  pages     = {224--250},
  abstract  = {LLMs have revolutionized the landscape of information retrieval and knowledge dissemination. However, their application in specialized areas is often hindered by limitations such as factual inaccuracies and hallucinations, especially in long-tail knowledge distributions. In this work, we explore the potential of retrieval-augmented generation (RAG) models in performing long-form question answering (LFQA) on a specially curated niche and custom knowledge domain. We present VedantaNY-10M, a dataset curated from extensive public discourses on the ancient Indian philosophy of Advaita Vedanta. We develop and benchmark a RAG model against a standard, non-RAG LLM, focusing on transcription, retrieval, and generation performance. A human evaluation involving computational linguists and domain experts, shows that the RAG model significantly outperforms the standard model in producing factual, comprehensive responses having fewer hallucinations. In addition, we find that a keyword-based hybrid retriever that focuses on unique low-frequency words further improves results. Our study provides insights into the future development of real-world RAG models for custom and niche areas of knowledge.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="mandikal-2024-ancient">
<titleInfo>
<title>Ancient Wisdom, Modern Tools: Exploring Retrieval-Augmented LLMs for Ancient Indian Philosophy</title>
</titleInfo>
<name type="personal">
<namePart type="given">Priyanka</namePart>
<namePart type="family">Mandikal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 1st Workshop on Machine Learning for Ancient Languages (ML4AL 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">John</namePart>
<namePart type="family">Pavlopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thea</namePart>
<namePart type="family">Sommerschield</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yannis</namePart>
<namePart type="family">Assael</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shai</namePart>
<namePart type="family">Gordin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kyunghyun</namePart>
<namePart type="family">Cho</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marco</namePart>
<namePart type="family">Passarotti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rachele</namePart>
<namePart type="family">Sprugnoli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yudong</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bin</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Adam</namePart>
<namePart type="family">Anderson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Hybrid in Bangkok, Thailand and online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>LLMs have revolutionized the landscape of information retrieval and knowledge dissemination. However, their application in specialized areas is often hindered by limitations such as factual inaccuracies and hallucinations, especially in long-tail knowledge distributions. In this work, we explore the potential of retrieval-augmented generation (RAG) models in performing long-form question answering (LFQA) on a specially curated niche and custom knowledge domain. We present VedantaNY-10M, a dataset curated from extensive public discourses on the ancient Indian philosophy of Advaita Vedanta. We develop and benchmark a RAG model against a standard, non-RAG LLM, focusing on transcription, retrieval, and generation performance. A human evaluation involving computational linguists and domain experts, shows that the RAG model significantly outperforms the standard model in producing factual, comprehensive responses having fewer hallucinations. In addition, we find that a keyword-based hybrid retriever that focuses on unique low-frequency words further improves results. Our study provides insights into the future development of real-world RAG models for custom and niche areas of knowledge.</abstract>
<identifier type="citekey">mandikal-2024-ancient</identifier>
<identifier type="doi">10.18653/v1/2024.ml4al-1.23</identifier>
<location>
<url>https://aclanthology.org/2024.ml4al-1.23/</url>
</location>
<part>
<date>2024-08</date>
<extent unit="page">
<start>224</start>
<end>250</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Ancient Wisdom, Modern Tools: Exploring Retrieval-Augmented LLMs for Ancient Indian Philosophy
%A Mandikal, Priyanka
%Y Pavlopoulos, John
%Y Sommerschield, Thea
%Y Assael, Yannis
%Y Gordin, Shai
%Y Cho, Kyunghyun
%Y Passarotti, Marco
%Y Sprugnoli, Rachele
%Y Liu, Yudong
%Y Li, Bin
%Y Anderson, Adam
%S Proceedings of the 1st Workshop on Machine Learning for Ancient Languages (ML4AL 2024)
%D 2024
%8 August
%I Association for Computational Linguistics
%C Hybrid in Bangkok, Thailand and online
%F mandikal-2024-ancient
%X LLMs have revolutionized the landscape of information retrieval and knowledge dissemination. However, their application in specialized areas is often hindered by limitations such as factual inaccuracies and hallucinations, especially in long-tail knowledge distributions. In this work, we explore the potential of retrieval-augmented generation (RAG) models in performing long-form question answering (LFQA) on a specially curated niche and custom knowledge domain. We present VedantaNY-10M, a dataset curated from extensive public discourses on the ancient Indian philosophy of Advaita Vedanta. We develop and benchmark a RAG model against a standard, non-RAG LLM, focusing on transcription, retrieval, and generation performance. A human evaluation involving computational linguists and domain experts, shows that the RAG model significantly outperforms the standard model in producing factual, comprehensive responses having fewer hallucinations. In addition, we find that a keyword-based hybrid retriever that focuses on unique low-frequency words further improves results. Our study provides insights into the future development of real-world RAG models for custom and niche areas of knowledge.
%R 10.18653/v1/2024.ml4al-1.23
%U https://aclanthology.org/2024.ml4al-1.23/
%U https://doi.org/10.18653/v1/2024.ml4al-1.23
%P 224-250
Markdown (Informal)
[Ancient Wisdom, Modern Tools: Exploring Retrieval-Augmented LLMs for Ancient Indian Philosophy](https://aclanthology.org/2024.ml4al-1.23/) (Mandikal, ML4AL 2024)
ACL