@inproceedings{ganiev-etal-2021-architecture,
title = "An Architecture for Accelerated Large-Scale Inference of Transformer-Based Language Models",
author = "Ganiev, Amir and
Chapin, Colton and
De Andrade, Anderson and
Liu, Chen",
editor = "Kim, Young-bum and
Li, Yunyao and
Rambow, Owen",
booktitle = "Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies: Industry Papers",
month = jun,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.naacl-industry.21/",
doi = "10.18653/v1/2021.naacl-industry.21",
pages = "163--169",
abstract = "This work demonstrates the development process of a machine learning architecture for inference that can scale to a large volume of requests. We used a BERT model that was fine-tuned for emotion analysis, returning a probability distribution of emotions given a paragraph. The model was deployed as a gRPC service on Kubernetes. Apache Spark was used to perform inference in batches by calling the service. We encountered some performance and concurrency challenges and created solutions to achieve faster running time. Starting with 200 successful inference requests per minute, we were able to achieve as high as 18 thousand successful requests per minute with the same batch job resource allocation. As a result, we successfully stored emotion probabilities for 95 million paragraphs within 96 hours."
}
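The abstract describes a batch-inference pattern: Spark workers call a BERT emotion model served over gRPC on Kubernetes, reusing connections to avoid per-request overhead. Below is a minimal sketch of that pattern, not the paper's actual code: the service address, column names, input/output paths, and the `emotion_pb2` / `emotion_pb2_grpc` stub modules are all hypothetical placeholders.

```python
# Sketch only: assumes a gRPC emotion service reachable in-cluster and
# stub modules generated from a hypothetical .proto definition.
import grpc
from pyspark.sql import SparkSession

import emotion_pb2        # hypothetical generated message module
import emotion_pb2_grpc   # hypothetical generated stub module

SERVICE_ADDR = "emotion-service.default.svc.cluster.local:50051"  # assumed

def infer_partition(rows):
    # One channel per partition amortizes connection setup across many
    # requests, the kind of concurrency bottleneck the paper reports.
    with grpc.insecure_channel(SERVICE_ADDR) as channel:
        stub = emotion_pb2_grpc.EmotionServiceStub(channel)
        for row in rows:
            request = emotion_pb2.ParagraphRequest(text=row.paragraph)
            response = stub.Predict(request, timeout=10.0)
            yield (row.id, list(response.probabilities))

spark = SparkSession.builder.appName("emotion-batch-inference").getOrCreate()
paragraphs = spark.read.parquet("s3://bucket/paragraphs")  # assumed input
results = paragraphs.rdd.mapPartitions(infer_partition)
results.toDF(["id", "emotion_probabilities"]).write.parquet("s3://bucket/emotions")
```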
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ganiev-etal-2021-architecture">
    <titleInfo>
        <title>An Architecture for Accelerated Large-Scale Inference of Transformer-Based Language Models</title>
    </titleInfo>
    <name type="personal">
        <namePart type="given">Amir</namePart>
        <namePart type="family">Ganiev</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Colton</namePart>
        <namePart type="family">Chapin</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Anderson</namePart>
        <namePart type="family">De Andrade</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Chen</namePart>
        <namePart type="family">Liu</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <originInfo>
        <dateIssued>2021-06</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
        <titleInfo>
            <title>Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies: Industry Papers</title>
        </titleInfo>
        <name type="personal">
            <namePart type="given">Young-bum</namePart>
            <namePart type="family">Kim</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Yunyao</namePart>
            <namePart type="family">Li</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Owen</namePart>
            <namePart type="family">Rambow</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <originInfo>
            <publisher>Association for Computational Linguistics</publisher>
            <place>
                <placeTerm type="text">Online</placeTerm>
            </place>
        </originInfo>
        <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>This work demonstrates the development process of a machine learning architecture for inference that can scale to a large volume of requests. We used a BERT model that was fine-tuned for emotion analysis, returning a probability distribution of emotions given a paragraph. The model was deployed as a gRPC service on Kubernetes. Apache Spark was used to perform inference in batches by calling the service. We encountered some performance and concurrency challenges and created solutions to achieve faster running time. Starting with 200 successful inference requests per minute, we were able to achieve as high as 18 thousand successful requests per minute with the same batch job resource allocation. As a result, we successfully stored emotion probabilities for 95 million paragraphs within 96 hours.</abstract>
    <identifier type="citekey">ganiev-etal-2021-architecture</identifier>
    <identifier type="doi">10.18653/v1/2021.naacl-industry.21</identifier>
    <location>
        <url>https://aclanthology.org/2021.naacl-industry.21/</url>
    </location>
    <part>
        <date>2021-06</date>
        <extent unit="page">
            <start>163</start>
            <end>169</end>
        </extent>
    </part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T An Architecture for Accelerated Large-Scale Inference of Transformer-Based Language Models
%A Ganiev, Amir
%A Chapin, Colton
%A De Andrade, Anderson
%A Liu, Chen
%Y Kim, Young-bum
%Y Li, Yunyao
%Y Rambow, Owen
%S Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies: Industry Papers
%D 2021
%8 June
%I Association for Computational Linguistics
%C Online
%F ganiev-etal-2021-architecture
%X This work demonstrates the development process of a machine learning architecture for inference that can scale to a large volume of requests. We used a BERT model that was fine-tuned for emotion analysis, returning a probability distribution of emotions given a paragraph. The model was deployed as a gRPC service on Kubernetes. Apache Spark was used to perform inference in batches by calling the service. We encountered some performance and concurrency challenges and created solutions to achieve faster running time. Starting with 200 successful inference requests per minute, we were able to achieve as high as 18 thousand successful requests per minute with the same batch job resource allocation. As a result, we successfully stored emotion probabilities for 95 million paragraphs within 96 hours.
%R 10.18653/v1/2021.naacl-industry.21
%U https://aclanthology.org/2021.naacl-industry.21/
%U https://doi.org/10.18653/v1/2021.naacl-industry.21
%P 163-169
Markdown (Informal)
[An Architecture for Accelerated Large-Scale Inference of Transformer-Based Language Models](https://aclanthology.org/2021.naacl-industry.21/) (Ganiev et al., NAACL 2021)
ACL
Amir Ganiev, Colton Chapin, Anderson De Andrade, and Chen Liu. 2021. An Architecture for Accelerated Large-Scale Inference of Transformer-Based Language Models. In Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies: Industry Papers, pages 163–169, Online. Association for Computational Linguistics.