@inproceedings{oinam-etal-2018-treebank,
title = "A Treebank for the Healthcare Domain",
author = "Oinam, Nganthoibi and
Mishra, Diwakar and
Patel, Pinal and
Choudhary, Narayan and
Desai, Hitesh",
editor = "Savary, Agata and
Ramisch, Carlos and
Hwang, Jena D. and
Schneider, Nathan and
Andresen, Melanie and
Pradhan, Sameer and
Petruck, Miriam R. L.",
booktitle = "Proceedings of the Joint Workshop on Linguistic Annotation, Multiword Expressions and Constructions ({LAW}-{MWE}-{C}x{G}-2018)",
month = aug,
year = "2018",
address = "Santa Fe, New Mexico, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/W18-4916",
pages = "144--155",
abstract = "This paper presents a treebank for the healthcare domain developed at ezDI. The treebank is created from a wide array of clinical health record documents across hospitals. The data has been de-identified and annotated for constituent syntactic structure. The treebank contains a total of 52053 sentences that have been sampled for subdomains as well as linguistic variations. The paper outlines the sampling process followed to ensure a better domain representation in the corpus, the annotation process and challenges, and corpus statistics. The Penn Treebank tagset and guidelines were largely followed, but there were many syntactic contexts that warranted adaptation of the guidelines. The treebank created was used to re-train the Berkeley parser and the Stanford parser. These parsers were also trained with the GENIA treebank for comparative quality assessment. Our treebank yielded great-er accuracy on both parsers. Berkeley parser performed better on our treebank with an average F1 measure of 91 across 5-folds. This was a significant jump from the out-of-the-box F1 score of 70 on Berkeley parser{'}s default grammar.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="oinam-etal-2018-treebank">
<titleInfo>
<title>A Treebank for the Healthcare Domain</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nganthoibi</namePart>
<namePart type="family">Oinam</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Diwakar</namePart>
<namePart type="family">Mishra</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pinal</namePart>
<namePart type="family">Patel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Narayan</namePart>
<namePart type="family">Choudhary</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hitesh</namePart>
<namePart type="family">Desai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2018-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Joint Workshop on Linguistic Annotation, Multiword Expressions and Constructions (LAW-MWE-CxG-2018)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Agata</namePart>
<namePart type="family">Savary</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carlos</namePart>
<namePart type="family">Ramisch</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jena</namePart>
<namePart type="given">D</namePart>
<namePart type="family">Hwang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nathan</namePart>
<namePart type="family">Schneider</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Melanie</namePart>
<namePart type="family">Andresen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sameer</namePart>
<namePart type="family">Pradhan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Miriam</namePart>
<namePart type="given">R</namePart>
<namePart type="given">L</namePart>
<namePart type="family">Petruck</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Santa Fe, New Mexico, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper presents a treebank for the healthcare domain developed at ezDI. The treebank is created from a wide array of clinical health record documents across hospitals. The data has been de-identified and annotated for constituent syntactic structure. The treebank contains a total of 52053 sentences that have been sampled for subdomains as well as linguistic variations. The paper outlines the sampling process followed to ensure a better domain representation in the corpus, the annotation process and challenges, and corpus statistics. The Penn Treebank tagset and guidelines were largely followed, but there were many syntactic contexts that warranted adaptation of the guidelines. The treebank created was used to re-train the Berkeley parser and the Stanford parser. These parsers were also trained with the GENIA treebank for comparative quality assessment. Our treebank yielded great-er accuracy on both parsers. Berkeley parser performed better on our treebank with an average F1 measure of 91 across 5-folds. This was a significant jump from the out-of-the-box F1 score of 70 on Berkeley parser’s default grammar.</abstract>
<identifier type="citekey">oinam-etal-2018-treebank</identifier>
<location>
<url>https://aclanthology.org/W18-4916</url>
</location>
<part>
<date>2018-08</date>
<extent unit="page">
<start>144</start>
<end>155</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Treebank for the Healthcare Domain
%A Oinam, Nganthoibi
%A Mishra, Diwakar
%A Patel, Pinal
%A Choudhary, Narayan
%A Desai, Hitesh
%Y Savary, Agata
%Y Ramisch, Carlos
%Y Hwang, Jena D.
%Y Schneider, Nathan
%Y Andresen, Melanie
%Y Pradhan, Sameer
%Y Petruck, Miriam R. L.
%S Proceedings of the Joint Workshop on Linguistic Annotation, Multiword Expressions and Constructions (LAW-MWE-CxG-2018)
%D 2018
%8 August
%I Association for Computational Linguistics
%C Santa Fe, New Mexico, USA
%F oinam-etal-2018-treebank
%X This paper presents a treebank for the healthcare domain developed at ezDI. The treebank is created from a wide array of clinical health record documents across hospitals. The data has been de-identified and annotated for constituent syntactic structure. The treebank contains a total of 52053 sentences that have been sampled for subdomains as well as linguistic variations. The paper outlines the sampling process followed to ensure a better domain representation in the corpus, the annotation process and challenges, and corpus statistics. The Penn Treebank tagset and guidelines were largely followed, but there were many syntactic contexts that warranted adaptation of the guidelines. The treebank created was used to re-train the Berkeley parser and the Stanford parser. These parsers were also trained with the GENIA treebank for comparative quality assessment. Our treebank yielded great-er accuracy on both parsers. Berkeley parser performed better on our treebank with an average F1 measure of 91 across 5-folds. This was a significant jump from the out-of-the-box F1 score of 70 on Berkeley parser’s default grammar.
%U https://aclanthology.org/W18-4916
%P 144-155
Markdown (Informal)
[A Treebank for the Healthcare Domain](https://aclanthology.org/W18-4916) (Oinam et al., LAW-MWE 2018)
ACL
- Nganthoibi Oinam, Diwakar Mishra, Pinal Patel, Narayan Choudhary, and Hitesh Desai. 2018. A Treebank for the Healthcare Domain. In Proceedings of the Joint Workshop on Linguistic Annotation, Multiword Expressions and Constructions (LAW-MWE-CxG-2018), pages 144–155, Santa Fe, New Mexico, USA. Association for Computational Linguistics.