001/** 
002 * Copyright (c) 2007-2008, Regents of the University of Colorado 
003 * All rights reserved.
004 * 
005 * Redistribution and use in source and binary forms, with or without
006 * modification, are permitted provided that the following conditions are met:
007 * 
008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
011 * 
012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
022 * POSSIBILITY OF SUCH DAMAGE. 
023 */
024package org.cleartk.corpus.propbank.util;
025
026import java.util.ArrayList;
027import java.util.Collections;
028import java.util.List;
029
030import org.apache.uima.jcas.JCas;
031import org.apache.uima.jcas.cas.FSArray;
032import org.cleartk.srl.type.Argument;
033import org.cleartk.srl.type.Predicate;
034import org.cleartk.syntax.constituent.type.TopTreebankNode;
035import org.cleartk.token.type.Sentence;
036import org.apache.uima.fit.util.FSCollectionFactory;
037
038import com.google.common.annotations.Beta;
039
040/**
041 * <br>
042 * Copyright (c) 2007-2008, Regents of the University of Colorado <br>
043 * All rights reserved.
044 * 
045 * 
046 * 
047 * <p>
048 * A <em>Propbank object</em> represents one entry in Propbank. It closely reflects the structure of
049 * one line in <tt>prop.txt</tt>.
050 * </p>
051 * 
052 * @author Philipp Wetzler, Philip Ogren, Steven Bethard
053 */
054@Beta
055public class Propbank {
056  /**
057   * Parses one Propbank entry and returns its representation as a <em>Propbank</em> object.
058   * 
059   * @param propTxt
060   *          one line from <tt>prop.txt</tt>
061   * 
062   * @return a <em>Propbank</em> object representing <b>propTxt</b>
063   */
064  public static Propbank fromString(String propTxt) {
065    String[] columns = propTxt.split(" ");
066    Propbank propbank = new Propbank();
067    try {
068      propbank.setPropTxt(propTxt);
069      // set filename, sentence number and predicate leaf number
070      propbank.setFilename(columns[0]);
071      propbank.setSentenceNumber(Integer.parseInt(columns[1]));
072      propbank.setTerminal(PropbankRelation.fromString(columns[2]));
073
074      // NomBank format - labels start in column 5
075      int labelsStart;
076      if (columns[5].indexOf(':') >= 0) {
077        labelsStart = 5;
078        propbank.setBaseForm(columns[3]);
079        propbank.setFrameSet(columns[4]);
080      }
081
082      // PropBank format - labels start in column 6
083      else {
084        labelsStart = 6;
085        propbank.setTaggerName(columns[3]);
086        String[] baseFormAndFrameSet = columns[4].split("\\.");
087        propbank.setBaseForm(baseFormAndFrameSet[0]);
088        propbank.setFrameSet(baseFormAndFrameSet[1]);
089        propbank.setInflectionValue(columns[5]);
090      }
091
092      // set each of the labels
093      for (int i = labelsStart; i < columns.length; i++) {
094        propbank.addProplabel(Proplabel.fromString(columns[i]));
095      }
096
097    } catch (ArrayIndexOutOfBoundsException e) {
098      throw new PropbankFormatException("invalid Propbank entry: " + propTxt);
099    } catch (NumberFormatException e) {
100      throw new PropbankFormatException("invalid Propbank entry: " + propTxt);
101    }
102    return propbank;
103  }
104
105  /**
106   * A convenience funtion to quickly read only the filename portion of a line from
107   * <tt>prop.txt</tt>.
108   * 
109   * @param propTxt
110   *          one line from <tt>prop.txt</tt>
111   * 
112   * @return the filename part of <b>propTxt</b>
113   */
114  public static String filenameFromString(String propTxt) {
115    return propTxt.split(" ")[0];
116  }
117
118  protected String filename;
119
120  protected int sentenceNumber;
121
122  protected PropbankRelation terminal;
123
124  protected String taggerName;
125
126  protected String baseForm;
127
128  protected String frameSet;
129
130  protected String inflectionValue;
131
132  protected List<Proplabel> proplabels;
133
134  protected String propTxt;
135
136  public String getPropTxt() {
137    return propTxt;
138  }
139
140  public void setPropTxt(String propTxt) {
141    this.propTxt = propTxt;
142  }
143
144  protected Propbank() {
145    proplabels = new ArrayList<Proplabel>();
146  }
147
148  public String getFilename() {
149    return filename;
150  }
151
152  public void setFilename(String filename) {
153    this.filename = filename;
154  }
155
156  public String getBaseForm() {
157    return baseForm;
158  }
159
160  public void setBaseForm(String baseForm) {
161    this.baseForm = baseForm;
162  }
163
164  public String getFrameSet() {
165    return frameSet;
166  }
167
168  public void setFrameSet(String frameSet) {
169    this.frameSet = frameSet;
170  }
171
172  public String getInflectionValue() {
173    return inflectionValue;
174  }
175
176  public void setInflectionValue(String inflectionValue) {
177    this.inflectionValue = inflectionValue;
178  }
179
180  public int getSentenceNumber() {
181    return sentenceNumber;
182  }
183
184  public void setSentenceNumber(int sentenceNumber) {
185    this.sentenceNumber = sentenceNumber;
186  }
187
188  public String getTaggerName() {
189    return taggerName;
190  }
191
192  public void setTaggerName(String taggerName) {
193    this.taggerName = taggerName;
194  }
195
196  public PropbankRelation getTerminal() {
197    return this.terminal;
198  }
199
200  public void setTerminal(PropbankRelation terminal) {
201    this.terminal = terminal;
202  }
203
204  public List<Proplabel> getPropLabels() {
205    return Collections.unmodifiableList(proplabels);
206  }
207
208  public void setPropLabels(List<Proplabel> proplabels) {
209    this.proplabels.clear();
210    if (proplabels != null) {
211      this.proplabels.addAll(proplabels);
212    }
213  }
214
215  public void addProplabel(Proplabel proplabel) {
216    this.proplabels.add(proplabel);
217  }
218
219  /**
220   * Convert to ClearTK <em>Predicate</em> / <em>SemanticArgument</em> annotations and add them to
221   * <b>view</b>.
222   * 
223   * @param view
224   *          the view where the annotations should be added
225   * @param topNode
226   *          the top node annotation of the corresponding Treebank parse
227   * @param sentence
228   *          the sentence annotation of the corresponding sentence
229   * @return the generated <em>Predicate</em> annotation
230   */
231  public Predicate convert(JCas view, TopTreebankNode topNode, Sentence sentence) {
232    Predicate p = new Predicate(view);
233    p.setPropTxt(this.propTxt);
234    p.setAnnotation(this.terminal.convert(view, topNode));
235    p.setBegin(p.getAnnotation().getBegin());
236    p.setEnd(p.getAnnotation().getEnd());
237    p.setSentence(sentence);
238    p.setFrameSet(this.frameSet);
239    p.setBaseForm(this.baseForm);
240
241    List<Argument> aList = new ArrayList<Argument>();
242    for (Proplabel proplabel : this.proplabels) {
243      aList.add(proplabel.convert(view, topNode));
244    }
245    p.setArguments(new FSArray(view, aList.size()));
246    FSCollectionFactory.fillArrayFS(p.getArguments(), aList);
247    p.addToIndexes();
248
249    return p;
250  }
251
252  /**
253   * Generate an easily readable multi-line description of this Propbank entry.
254   */
255  public String displayText() {
256    StringBuffer text = new StringBuffer(String.format(
257        "filename = %s\n" + "sentence number = %s\n" + "terminal = %s\n" + "base form = %s\n"
258            + "frame set = %s\n" + "tagger = %s\n" + "inflection value = %s\n",
259        this.getFilename(),
260        this.getSentenceNumber(),
261        this.getTerminal(),
262        this.getBaseForm(),
263        this.getFrameSet(),
264        this.getTaggerName(),
265        this.getInflectionValue()));
266    for (Proplabel label : getPropLabels()) {
267      text.append(String.format(
268          "proplabel = %s %s\n" + "text = %s\n",
269          label.getLabel(),
270          label.getFeature(),
271          label.getRelation()));
272    }
273    return text.toString();
274  }
275
276  /**
277   * Re-generate the Propbank format line that this object was parsed from.
278   */
279  @Override
280  public String toString() {
281    StringBuffer buffer = new StringBuffer();
282
283    buffer.append(String.format(
284        "%s %s %s",
285        this.getFilename(),
286        this.getSentenceNumber(),
287        this.getTerminal()));
288    // NomBank
289    if (this.getTaggerName() == null) {
290      buffer.append(String.format(" %s %s", this.getBaseForm(), this.getFrameSet()));
291    }
292    // PropBank
293    else {
294      buffer.append(String.format(
295          " %s %s.%s %s",
296          this.getTaggerName(),
297          this.getBaseForm(),
298          this.getFrameSet(),
299          this.getInflectionValue()));
300    }
301
302    for (Proplabel label : getPropLabels()) {
303      buffer.append(' ');
304      buffer.append(label);
305    }
306
307    return buffer.toString();
308  }
309
310}