@inproceedings{koloski-etal-2023-reconstruct,
title = "Reconstruct to Retrieve: Identifying interesting news in a Cross-lingual setting",
author = "Koloski, Boshko and
Skrlj, Blaz and
Lavrac, Nada and
Pollak, Senja",
editor = "Breitholtz, Ellen and
Lappin, Shalom and
Loaiciga, Sharid and
Ilinykh, Nikolai and
Dobnik, Simon",
booktitle = "Proceedings of the 2023 CLASP Conference on Learning with Small Data (LSD)",
month = sep,
year = "2023",
address = "Gothenburg, Sweden",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.clasp-1.10/",
pages = "81--89",
abstract = "An important and resource-intensive task in journalism is retrieving relevant foreign news and its adaptation for local readers. Given the vast amount of foreign articles published and the limited number of journalists available to evaluate their interestingness, this task can be particularly challenging, especially when dealing with smaller languages and countries. In this work, we propose a novel method for large-scale retrieval of potentially translation-worthy articles based on an auto-encoder neural network trained on a limited corpus of relevant foreign news. We hypothesize that the representations of interesting news can be reconstructed very well by an auto-encoder, while irrelevant news would have less adequate reconstructions since they are not used for training the network. Specifically, we focus on extracting articles from the Latvian media for Estonian news media houses. It is worth noting that the available corpora for this task are particularly limited, which adds an extra layer of difficulty to our approach. To evaluate the proposed method, we rely on manual evaluation by an Estonian journalist at Ekspress Meedia and automatic evaluation on a gold standard test set."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="koloski-etal-2023-reconstruct">
<titleInfo>
<title>Reconstruct to Retrieve: Identifying interesting news in a Cross-lingual setting</title>
</titleInfo>
<name type="personal">
<namePart type="given">Boshko</namePart>
<namePart type="family">Koloski</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Blaz</namePart>
<namePart type="family">Skrlj</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nada</namePart>
<namePart type="family">Lavrac</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Senja</namePart>
<namePart type="family">Pollak</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2023 CLASP Conference on Learning with Small Data (LSD)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ellen</namePart>
<namePart type="family">Breitholtz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shalom</namePart>
<namePart type="family">Lappin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sharid</namePart>
<namePart type="family">Loaiciga</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nikolai</namePart>
<namePart type="family">Ilinykh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Simon</namePart>
<namePart type="family">Dobnik</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Gothenburg, Sweden</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>An important and resource-intensive task in journalism is retrieving relevant foreign news and its adaptation for local readers. Given the vast amount of foreign articles published and the limited number of journalists available to evaluate their interestingness, this task can be particularly challenging, especially when dealing with smaller languages and countries. In this work, we propose a novel method for large-scale retrieval of potentially translation-worthy articles based on an auto-encoder neural network trained on a limited corpus of relevant foreign news. We hypothesize that the representations of interesting news can be reconstructed very well by an auto-encoder, while irrelevant news would have less adequate reconstructions since they are not used for training the network. Specifically, we focus on extracting articles from the Latvian media for Estonian news media houses. It is worth noting that the available corpora for this task are particularly limited, which adds an extra layer of difficulty to our approach. To evaluate the proposed method, we rely on manual evaluation by an Estonian journalist at Ekspress Meedia and automatic evaluation on a gold standard test set.</abstract>
<identifier type="citekey">koloski-etal-2023-reconstruct</identifier>
<location>
<url>https://aclanthology.org/2023.clasp-1.10/</url>
</location>
<part>
<date>2023-09</date>
<extent unit="page">
<start>81</start>
<end>89</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Reconstruct to Retrieve: Identifying interesting news in a Cross-lingual setting
%A Koloski, Boshko
%A Skrlj, Blaz
%A Lavrac, Nada
%A Pollak, Senja
%Y Breitholtz, Ellen
%Y Lappin, Shalom
%Y Loaiciga, Sharid
%Y Ilinykh, Nikolai
%Y Dobnik, Simon
%S Proceedings of the 2023 CLASP Conference on Learning with Small Data (LSD)
%D 2023
%8 September
%I Association for Computational Linguistics
%C Gothenburg, Sweden
%F koloski-etal-2023-reconstruct
%X An important and resource-intensive task in journalism is retrieving relevant foreign news and its adaptation for local readers. Given the vast amount of foreign articles published and the limited number of journalists available to evaluate their interestingness, this task can be particularly challenging, especially when dealing with smaller languages and countries. In this work, we propose a novel method for large-scale retrieval of potentially translation-worthy articles based on an auto-encoder neural network trained on a limited corpus of relevant foreign news. We hypothesize that the representations of interesting news can be reconstructed very well by an auto-encoder, while irrelevant news would have less adequate reconstructions since they are not used for training the network. Specifically, we focus on extracting articles from the Latvian media for Estonian news media houses. It is worth noting that the available corpora for this task are particularly limited, which adds an extra layer of difficulty to our approach. To evaluate the proposed method, we rely on manual evaluation by an Estonian journalist at Ekspress Meedia and automatic evaluation on a gold standard test set.
%U https://aclanthology.org/2023.clasp-1.10/
%P 81-89
Markdown (Informal)
[Reconstruct to Retrieve: Identifying interesting news in a Cross-lingual setting](https://aclanthology.org/2023.clasp-1.10/) (Koloski et al., CLASP 2023)
ACL