001/** 002 * Copyright (c) 2007-2008, Regents of the University of Colorado 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without 006 * modification, are permitted provided that the following conditions are met: 007 * 008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 011 * 012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 022 * POSSIBILITY OF SUCH DAMAGE. 023 */ 024package org.cleartk.corpus.propbank.util; 025 026import java.util.ArrayList; 027import java.util.Collections; 028import java.util.List; 029 030import org.apache.uima.jcas.JCas; 031import org.apache.uima.jcas.cas.FSArray; 032import org.cleartk.srl.type.Argument; 033import org.cleartk.srl.type.Predicate; 034import org.cleartk.syntax.constituent.type.TopTreebankNode; 035import org.cleartk.token.type.Sentence; 036import org.apache.uima.fit.util.FSCollectionFactory; 037 038import com.google.common.annotations.Beta; 039 040/** 041 * <br> 042 * Copyright (c) 2007-2008, Regents of the University of Colorado <br> 043 * All rights reserved. 044 * 045 * 046 * 047 * <p> 048 * A <em>Propbank object</em> represents one entry in Propbank. It closely reflects the structure of 049 * one line in <tt>prop.txt</tt>. 050 * </p> 051 * 052 * @author Philipp Wetzler, Philip Ogren, Steven Bethard 053 */ 054@Beta 055public class Propbank { 056 /** 057 * Parses one Propbank entry and returns its representation as a <em>Propbank</em> object. 058 * 059 * @param propTxt 060 * one line from <tt>prop.txt</tt> 061 * 062 * @return a <em>Propbank</em> object representing <b>propTxt</b> 063 */ 064 public static Propbank fromString(String propTxt) { 065 String[] columns = propTxt.split(" "); 066 Propbank propbank = new Propbank(); 067 try { 068 propbank.setPropTxt(propTxt); 069 // set filename, sentence number and predicate leaf number 070 propbank.setFilename(columns[0]); 071 propbank.setSentenceNumber(Integer.parseInt(columns[1])); 072 propbank.setTerminal(PropbankRelation.fromString(columns[2])); 073 074 // NomBank format - labels start in column 5 075 int labelsStart; 076 if (columns[5].indexOf(':') >= 0) { 077 labelsStart = 5; 078 propbank.setBaseForm(columns[3]); 079 propbank.setFrameSet(columns[4]); 080 } 081 082 // PropBank format - labels start in column 6 083 else { 084 labelsStart = 6; 085 propbank.setTaggerName(columns[3]); 086 String[] baseFormAndFrameSet = columns[4].split("\\."); 087 propbank.setBaseForm(baseFormAndFrameSet[0]); 088 propbank.setFrameSet(baseFormAndFrameSet[1]); 089 propbank.setInflectionValue(columns[5]); 090 } 091 092 // set each of the labels 093 for (int i = labelsStart; i < columns.length; i++) { 094 propbank.addProplabel(Proplabel.fromString(columns[i])); 095 } 096 097 } catch (ArrayIndexOutOfBoundsException e) { 098 throw new PropbankFormatException("invalid Propbank entry: " + propTxt); 099 } catch (NumberFormatException e) { 100 throw new PropbankFormatException("invalid Propbank entry: " + propTxt); 101 } 102 return propbank; 103 } 104 105 /** 106 * A convenience funtion to quickly read only the filename portion of a line from 107 * <tt>prop.txt</tt>. 108 * 109 * @param propTxt 110 * one line from <tt>prop.txt</tt> 111 * 112 * @return the filename part of <b>propTxt</b> 113 */ 114 public static String filenameFromString(String propTxt) { 115 return propTxt.split(" ")[0]; 116 } 117 118 protected String filename; 119 120 protected int sentenceNumber; 121 122 protected PropbankRelation terminal; 123 124 protected String taggerName; 125 126 protected String baseForm; 127 128 protected String frameSet; 129 130 protected String inflectionValue; 131 132 protected List<Proplabel> proplabels; 133 134 protected String propTxt; 135 136 public String getPropTxt() { 137 return propTxt; 138 } 139 140 public void setPropTxt(String propTxt) { 141 this.propTxt = propTxt; 142 } 143 144 protected Propbank() { 145 proplabels = new ArrayList<Proplabel>(); 146 } 147 148 public String getFilename() { 149 return filename; 150 } 151 152 public void setFilename(String filename) { 153 this.filename = filename; 154 } 155 156 public String getBaseForm() { 157 return baseForm; 158 } 159 160 public void setBaseForm(String baseForm) { 161 this.baseForm = baseForm; 162 } 163 164 public String getFrameSet() { 165 return frameSet; 166 } 167 168 public void setFrameSet(String frameSet) { 169 this.frameSet = frameSet; 170 } 171 172 public String getInflectionValue() { 173 return inflectionValue; 174 } 175 176 public void setInflectionValue(String inflectionValue) { 177 this.inflectionValue = inflectionValue; 178 } 179 180 public int getSentenceNumber() { 181 return sentenceNumber; 182 } 183 184 public void setSentenceNumber(int sentenceNumber) { 185 this.sentenceNumber = sentenceNumber; 186 } 187 188 public String getTaggerName() { 189 return taggerName; 190 } 191 192 public void setTaggerName(String taggerName) { 193 this.taggerName = taggerName; 194 } 195 196 public PropbankRelation getTerminal() { 197 return this.terminal; 198 } 199 200 public void setTerminal(PropbankRelation terminal) { 201 this.terminal = terminal; 202 } 203 204 public List<Proplabel> getPropLabels() { 205 return Collections.unmodifiableList(proplabels); 206 } 207 208 public void setPropLabels(List<Proplabel> proplabels) { 209 this.proplabels.clear(); 210 if (proplabels != null) { 211 this.proplabels.addAll(proplabels); 212 } 213 } 214 215 public void addProplabel(Proplabel proplabel) { 216 this.proplabels.add(proplabel); 217 } 218 219 /** 220 * Convert to ClearTK <em>Predicate</em> / <em>SemanticArgument</em> annotations and add them to 221 * <b>view</b>. 222 * 223 * @param view 224 * the view where the annotations should be added 225 * @param topNode 226 * the top node annotation of the corresponding Treebank parse 227 * @param sentence 228 * the sentence annotation of the corresponding sentence 229 * @return the generated <em>Predicate</em> annotation 230 */ 231 public Predicate convert(JCas view, TopTreebankNode topNode, Sentence sentence) { 232 Predicate p = new Predicate(view); 233 p.setPropTxt(this.propTxt); 234 p.setAnnotation(this.terminal.convert(view, topNode)); 235 p.setBegin(p.getAnnotation().getBegin()); 236 p.setEnd(p.getAnnotation().getEnd()); 237 p.setSentence(sentence); 238 p.setFrameSet(this.frameSet); 239 p.setBaseForm(this.baseForm); 240 241 List<Argument> aList = new ArrayList<Argument>(); 242 for (Proplabel proplabel : this.proplabels) { 243 aList.add(proplabel.convert(view, topNode)); 244 } 245 p.setArguments(new FSArray(view, aList.size())); 246 FSCollectionFactory.fillArrayFS(p.getArguments(), aList); 247 p.addToIndexes(); 248 249 return p; 250 } 251 252 /** 253 * Generate an easily readable multi-line description of this Propbank entry. 254 */ 255 public String displayText() { 256 StringBuffer text = new StringBuffer(String.format( 257 "filename = %s\n" + "sentence number = %s\n" + "terminal = %s\n" + "base form = %s\n" 258 + "frame set = %s\n" + "tagger = %s\n" + "inflection value = %s\n", 259 this.getFilename(), 260 this.getSentenceNumber(), 261 this.getTerminal(), 262 this.getBaseForm(), 263 this.getFrameSet(), 264 this.getTaggerName(), 265 this.getInflectionValue())); 266 for (Proplabel label : getPropLabels()) { 267 text.append(String.format( 268 "proplabel = %s %s\n" + "text = %s\n", 269 label.getLabel(), 270 label.getFeature(), 271 label.getRelation())); 272 } 273 return text.toString(); 274 } 275 276 /** 277 * Re-generate the Propbank format line that this object was parsed from. 278 */ 279 @Override 280 public String toString() { 281 StringBuffer buffer = new StringBuffer(); 282 283 buffer.append(String.format( 284 "%s %s %s", 285 this.getFilename(), 286 this.getSentenceNumber(), 287 this.getTerminal())); 288 // NomBank 289 if (this.getTaggerName() == null) { 290 buffer.append(String.format(" %s %s", this.getBaseForm(), this.getFrameSet())); 291 } 292 // PropBank 293 else { 294 buffer.append(String.format( 295 " %s %s.%s %s", 296 this.getTaggerName(), 297 this.getBaseForm(), 298 this.getFrameSet(), 299 this.getInflectionValue())); 300 } 301 302 for (Proplabel label : getPropLabels()) { 303 buffer.append(' '); 304 buffer.append(label); 305 } 306 307 return buffer.toString(); 308 } 309 310}