@inproceedings{loc-etal-2022-development,
title = "Development of a {M}ulti{M}odal Annotation Framework and Dataset for Deep Video Understanding",
author = "Loc, Erika and
Curtis, Keith and
Awad, George and
Rajput, Shahzad and
Soboroff, Ian",
editor = "Paggio, Patrizia and
Gatt, Albert and
Tanti, Marc",
booktitle = "Proceedings of the 2nd Workshop on People in Vision, Language, and the Mind",
month = jun,
year = "2022",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://aclanthology.org/2022.pvlam-1.3/",
pages = "12--16",
abstract = "In this paper we introduce our approach and methods for collecting and annotating a new dataset for deep video understanding. The proposed dataset is composed of 3 seasons (15 episodes) of the BBC Land Girls TV Series in addition to 14 Creative Commons movies with total duration of 28.5 hr. The main contribution of this paper is a novel annotation framework on the movie and scene levels to support an automatic query generation process that can capture the high-level movie features (e.g. how characters and locations are related to each other) as well as fine-grained scene-level features (e.g. character interactions, natural language descriptions, and sentiments). Movie-level annotations include constructing a global static knowledge graph (KG) to capture major relationships, while the scene-level annotations include constructing a sequence of knowledge graphs (KGs) to capture fine-grained features. The annotation framework supports generating multiple query types. The objective of the framework is to provide a guide to annotating long duration videos to support tasks and challenges in the video and multimedia understanding domains. These tasks and challenges can support testing automatic systems on their ability to learn and comprehend a movie or long video in terms of actors, entities, events, interactions and their relationship to each other."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="loc-etal-2022-development">
<titleInfo>
<title>Development of a MultiModal Annotation Framework and Dataset for Deep Video Understanding</title>
</titleInfo>
<name type="personal">
<namePart type="given">Erika</namePart>
<namePart type="family">Loc</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Keith</namePart>
<namePart type="family">Curtis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">George</namePart>
<namePart type="family">Awad</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shahzad</namePart>
<namePart type="family">Rajput</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ian</namePart>
<namePart type="family">Soboroff</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2nd Workshop on People in Vision, Language, and the Mind</title>
</titleInfo>
<name type="personal">
<namePart type="given">Patrizia</namePart>
<namePart type="family">Paggio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Albert</namePart>
<namePart type="family">Gatt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marc</namePart>
<namePart type="family">Tanti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In this paper we introduce our approach and methods for collecting and annotating a new dataset for deep video understanding. The proposed dataset is composed of 3 seasons (15 episodes) of the BBC Land Girls TV Series in addition to 14 Creative Commons movies with total duration of 28.5 hr. The main contribution of this paper is a novel annotation framework on the movie and scene levels to support an automatic query generation process that can capture the high-level movie features (e.g. how characters and locations are related to each other) as well as fine-grained scene-level features (e.g. character interactions, natural language descriptions, and sentiments). Movie-level annotations include constructing a global static knowledge graph (KG) to capture major relationships, while the scene-level annotations include constructing a sequence of knowledge graphs (KGs) to capture fine-grained features. The annotation framework supports generating multiple query types. The objective of the framework is to provide a guide to annotating long duration videos to support tasks and challenges in the video and multimedia understanding domains. These tasks and challenges can support testing automatic systems on their ability to learn and comprehend a movie or long video in terms of actors, entities, events, interactions and their relationship to each other.</abstract>
<identifier type="citekey">loc-etal-2022-development</identifier>
<location>
<url>https://aclanthology.org/2022.pvlam-1.3/</url>
</location>
<part>
<date>2022-06</date>
<extent unit="page">
<start>12</start>
<end>16</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Development of a MultiModal Annotation Framework and Dataset for Deep Video Understanding
%A Loc, Erika
%A Curtis, Keith
%A Awad, George
%A Rajput, Shahzad
%A Soboroff, Ian
%Y Paggio, Patrizia
%Y Gatt, Albert
%Y Tanti, Marc
%S Proceedings of the 2nd Workshop on People in Vision, Language, and the Mind
%D 2022
%8 June
%I European Language Resources Association
%C Marseille, France
%F loc-etal-2022-development
%X In this paper we introduce our approach and methods for collecting and annotating a new dataset for deep video understanding. The proposed dataset is composed of 3 seasons (15 episodes) of the BBC Land Girls TV Series in addition to 14 Creative Commons movies with total duration of 28.5 hr. The main contribution of this paper is a novel annotation framework on the movie and scene levels to support an automatic query generation process that can capture the high-level movie features (e.g. how characters and locations are related to each other) as well as fine-grained scene-level features (e.g. character interactions, natural language descriptions, and sentiments). Movie-level annotations include constructing a global static knowledge graph (KG) to capture major relationships, while the scene-level annotations include constructing a sequence of knowledge graphs (KGs) to capture fine-grained features. The annotation framework supports generating multiple query types. The objective of the framework is to provide a guide to annotating long duration videos to support tasks and challenges in the video and multimedia understanding domains. These tasks and challenges can support testing automatic systems on their ability to learn and comprehend a movie or long video in terms of actors, entities, events, interactions and their relationship to each other.
%U https://aclanthology.org/2022.pvlam-1.3/
%P 12-16
Markdown (Informal)
[Development of a MultiModal Annotation Framework and Dataset for Deep Video Understanding](https://aclanthology.org/2022.pvlam-1.3/) (Loc et al., PVLAM 2022)
ACL