@inproceedings{gunther-etal-2023-jina,
title = "{J}ina Embeddings: A Novel Set of High-Performance Sentence Embedding Models",
author = {G{\"u}nther, Michael and
Milliken, Louis and
Geuter, Jonathan and
Mastrapas, Georgios and
Wang, Bo and
Xiao, Han},
editor = "Tan, Liling and
Milajevs, Dmitrijs and
Chauhan, Geeticka and
Gwinnup, Jeremy and
Rippeth, Elijah",
booktitle = "Proceedings of the 3rd Workshop for Natural Language Processing Open Source Software (NLP-OSS 2023)",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.nlposs-1.2/",
doi = "10.18653/v1/2023.nlposs-1.2",
pages = "8--18",
abstract = "Jina Embeddings constitutes a set of high-performance sentence embedding models adept at translating textual inputs into numerical representations, capturing the semantics of the text. These models excel in applications like dense retrieval and semantic textual similarity. This paper details the development of Jina Embeddings, starting with the creation of high-quality pairwise and triplet datasets. It underlines the crucial role of data cleaning in dataset preparation, offers in-depth insights into the model training process, and concludes with a comprehensive performance evaluation using the Massive Text Embedding Benchmark (MTEB). Furthermore, to increase the model's awareness of grammatical negation, we construct a novel training and evaluation dataset of negated and non-negated statements, which we make publicly available to the community."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="gunther-etal-2023-jina">
<titleInfo>
<title>Jina Embeddings: A Novel Set of High-Performance Sentence Embedding Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Michael</namePart>
<namePart type="family">Günther</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Louis</namePart>
<namePart type="family">Milliken</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jonathan</namePart>
<namePart type="family">Geuter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Georgios</namePart>
<namePart type="family">Mastrapas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bo</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Han</namePart>
<namePart type="family">Xiao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 3rd Workshop for Natural Language Processing Open Source Software (NLP-OSS 2023)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Liling</namePart>
<namePart type="family">Tan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dmitrijs</namePart>
<namePart type="family">Milajevs</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Geeticka</namePart>
<namePart type="family">Chauhan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jeremy</namePart>
<namePart type="family">Gwinnup</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Elijah</namePart>
<namePart type="family">Rippeth</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Singapore</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Jina Embeddings constitutes a set of high-performance sentence embedding models adept at translating textual inputs into numerical representations, capturing the semantics of the text. These models excel in applications like dense retrieval and semantic textual similarity. This paper details the development of Jina Embeddings, starting with the creation of high-quality pairwise and triplet datasets. It underlines the crucial role of data cleaning in dataset preparation, offers in-depth insights into the model training process, and concludes with a comprehensive performance evaluation using the Massive Text Embedding Benchmark (MTEB). Furthermore, to increase the model's awareness of grammatical negation, we construct a novel training and evaluation dataset of negated and non-negated statements, which we make publicly available to the community.</abstract>
<identifier type="citekey">gunther-etal-2023-jina</identifier>
<identifier type="doi">10.18653/v1/2023.nlposs-1.2</identifier>
<location>
<url>https://aclanthology.org/2023.nlposs-1.2/</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>8</start>
<end>18</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Jina Embeddings: A Novel Set of High-Performance Sentence Embedding Models
%A Günther, Michael
%A Milliken, Louis
%A Geuter, Jonathan
%A Mastrapas, Georgios
%A Wang, Bo
%A Xiao, Han
%Y Tan, Liling
%Y Milajevs, Dmitrijs
%Y Chauhan, Geeticka
%Y Gwinnup, Jeremy
%Y Rippeth, Elijah
%S Proceedings of the 3rd Workshop for Natural Language Processing Open Source Software (NLP-OSS 2023)
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F gunther-etal-2023-jina
%X Jina Embeddings constitutes a set of high-performance sentence embedding models adept at translating textual inputs into numerical representations, capturing the semantics of the text. These models excel in applications like dense retrieval and semantic textual similarity. This paper details the development of Jina Embeddings, starting with the creation of high-quality pairwise and triplet datasets. It underlines the crucial role of data cleaning in dataset preparation, offers in-depth insights into the model training process, and concludes with a comprehensive performance evaluation using the Massive Text Embedding Benchmark (MTEB). Furthermore, to increase the model's awareness of grammatical negation, we construct a novel training and evaluation dataset of negated and non-negated statements, which we make publicly available to the community.
%R 10.18653/v1/2023.nlposs-1.2
%U https://aclanthology.org/2023.nlposs-1.2/
%U https://doi.org/10.18653/v1/2023.nlposs-1.2
%P 8-18
Markdown (Informal)
[Jina Embeddings: A Novel Set of High-Performance Sentence Embedding Models](https://aclanthology.org/2023.nlposs-1.2/) (Günther et al., NLPOSS 2023)
ACL