001/** 
002 * Copyright (c) 2007-2008, Regents of the University of Colorado 
003 * All rights reserved.
004 * 
005 * Redistribution and use in source and binary forms, with or without
006 * modification, are permitted provided that the following conditions are met:
007 * 
008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
011 * 
012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
022 * POSSIBILITY OF SUCH DAMAGE. 
023 */
024package org.cleartk.corpus.conll2005;
025
026import java.io.File;
027import java.io.FileNotFoundException;
028import java.io.PrintWriter;
029import java.util.ArrayList;
030import java.util.Collection;
031import java.util.List;
032
033import org.apache.uima.UimaContext;
034import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
035import org.apache.uima.jcas.JCas;
036import org.apache.uima.resource.ResourceInitializationException;
037import org.cleartk.srl.type.Argument;
038import org.cleartk.srl.type.Predicate;
039import org.cleartk.srl.type.SemanticArgument;
040import org.cleartk.token.type.Sentence;
041import org.cleartk.token.type.Token;
042import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
043import org.apache.uima.fit.descriptor.ConfigurationParameter;
044import org.apache.uima.fit.util.JCasUtil;
045
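/**
 * Analysis engine that writes semantic role annotations to a text file, one line per token.
 * Each line holds the base form of the predicate anchored at that token (or "-" when there is
 * none), followed by one tab-separated column per predicate of the sentence containing the
 * bracketed argument labels, e.g. "(A0*" or "*)". Sentences are separated by a blank line.
 * <br>
 * Copyright (c) 2007-2008, Regents of the University of Colorado <br>
 * All rights reserved.
 */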
051public class Conll2005Writer extends JCasAnnotator_ImplBase {
052
053  @ConfigurationParameter(
054      name = PARAM_OUTPUT_FILE,
055      mandatory = true,
056      description = "the path where the CoNLL-2005-formatted text should be written")
057  private File outputFile;
058
059  public static final String PARAM_OUTPUT_FILE = "outputFile";
060
061  private PrintWriter output;
062
063  private boolean first;
064
065  @Override
066  public void initialize(UimaContext context) throws ResourceInitializationException {
067    super.initialize(context);
068    try {
069      this.output = new PrintWriter(outputFile);
070      this.first = true;
071    } catch (FileNotFoundException e) {
072      throw new ResourceInitializationException(e);
073    }
074  }
075
076  @Override
077  public void process(JCas jCas) throws AnalysisEngineProcessException {
078    Collection<Sentence> sentences = JCasUtil.select(jCas, Sentence.class);
079    for (Sentence sentence : sentences) {
080      if (first)
081        first = false;
082      else
083        output.println();
084
085      List<PredicateWriter> predicateWriters = new ArrayList<PredicateWriter>();
086      for (Predicate predicate : JCasUtil.selectCovered(jCas, Predicate.class, sentence)) {
087        predicateWriters.add(new PredicateWriter(jCas, predicate));
088      }
089
090      for (Token token : JCasUtil.selectCovered(jCas, Token.class, sentence)) {
091        Conll05Line line = new Conll05Line();
092
093        // line.setLexeme(token.getCoveredText());
094        // line.setPos(token.getPos());
095
096        for (PredicateWriter predicateWriter : predicateWriters) {
097          predicateWriter.write(token, line);
098        }
099
100        output.println(line.evalString());
101      }
102
103    }
104    output.flush();
105  }
106
107  @Override
108  public void collectionProcessComplete() throws AnalysisEngineProcessException {
109    output.close();
110    super.collectionProcessComplete();
111  }
112
113  private static class Conll05Line {
114    // String lexeme;
115    // String pos;
116    // String syntaxSegment;
117    // String neSegment;
118    // String predicateFrameset;
119    String predicateBaseform;
120
121    List<String> argumentRoles;
122
123    public Conll05Line() {
124      // lexeme = "<empty>";
125      // pos = "<empty>";
126      // syntaxSegment = "*";
127      // neSegment = "*";
128      // predicateFrameset = "-";
129      predicateBaseform = "-";
130      argumentRoles = new ArrayList<String>();
131    }
132
133    public String evalString() {
134      StringBuffer buffer = new StringBuffer();
135      // buffer.append(lexeme);
136      // buffer.append("\t");
137      // buffer.append(pos);
138      // buffer.append("\t");
139      // buffer.append(syntaxSegment);
140      // buffer.append("\t");
141      // buffer.append(neSegment);
142      // buffer.append("\t");
143      // buffer.append("predicateFrameset");
144      // buffer.append("\t");
145      buffer.append(predicateBaseform);
146
147      for (String argumentRole : argumentRoles) {
148        buffer.append("\t");
149        buffer.append(argumentRole);
150      }
151
152      return buffer.toString();
153    }
154
155    // public void setLexeme(String lexeme) {
156    // this.lexeme = lexeme;
157    // }
158    //
159    // public void setPos(String pos) {
160    // this.pos = pos;
161    // }
162
163    // public void setSyntaxSegment(String syntaxSegment) {
164    // this.syntaxSegment = syntaxSegment;
165    // }
166
167    // public void setNeSegment(String neSegment) {
168    // this.neSegment = neSegment;
169    // }
170
171    public void setPredicateFrameset(Integer predicateFrameset) {
172      // if( predicateFrameset == null ) {
173      // this.predicateFrameset = "-";
174      // } else {
175      // this.predicateFrameset = String.format("%2d", predicateFrameset);
176      // }
177    }
178
179    public void setPredicateBaseform(String predicateBaseform) {
180      if (predicateBaseform == null) {
181        this.predicateBaseform = "-";
182      } else {
183        this.predicateBaseform = predicateBaseform;
184      }
185    }
186
187    public void addArgumentRole(String argumentRole) {
188      this.argumentRoles.add(argumentRole);
189    }
190  }
191
192  private static class PredicateWriter {
193    String baseform;
194
195    Integer frameset;
196
197    Token token;
198
199    List<ArgumentWriter> argumentWriters;
200
201    PredicateWriter(JCas jCas, Predicate predicate) {
202      this.token = JCasUtil.selectCovered(jCas, Token.class, predicate.getAnnotation()).get(0);
203      this.baseform = predicate.getBaseForm();
204      this.frameset = 1;
205
206      Collection<Argument> allArgs = JCasUtil.select(predicate.getArguments(), Argument.class);
207      this.argumentWriters = new ArrayList<ArgumentWriter>();
208      for (Argument arg : allArgs) {
209        if (arg instanceof SemanticArgument) {
210          this.argumentWriters.add(new ArgumentWriter(jCas, (SemanticArgument) arg));
211        }
212      }
213    }
214
215    void write(Token tok, Conll05Line line) {
216      if (this.token.equals(tok)) {
217        line.setPredicateBaseform(this.baseform);
218        line.setPredicateFrameset(this.frameset);
219      }
220
221      line.addArgumentRole(getArgumentsString(tok));
222    }
223
224    String getArgumentsString(Token tok) {
225      StringBuffer buffer = new StringBuffer();
226
227      for (ArgumentWriter argumentWriter : this.argumentWriters) {
228        buffer.append(argumentWriter.getStartString(tok));
229      }
230
231      buffer.append("*");
232
233      for (ArgumentWriter argumentWriter : this.argumentWriters) {
234        buffer.append(argumentWriter.getEndString(tok));
235      }
236
237      return buffer.toString();
238    }
239  }
240
241  private static class ArgumentWriter {
242    String label;
243
244    String feature;
245
246    String preposition;
247
248    List<Token> tokens;
249
250    ArgumentWriter(JCas jCas, SemanticArgument arg) {
251      this.label = arg.getLabel();
252      this.feature = arg.getFeature();
253      this.preposition = arg.getPreposition();
254      this.tokens = JCasUtil.selectCovered(jCas, Token.class, arg.getAnnotation());
255    }
256
257    String getStartString(Token token) {
258      if (token == this.tokens.get(0))
259        return "(" + getFullLabel();
260      else
261        return "";
262    }
263
264    String getEndString(Token token) {
265      if (token == this.tokens.get(this.tokens.size() - 1))
266        return ")";
267      else
268        return "";
269    }
270
271    String getFullLabel() {
272      StringBuffer buffer = new StringBuffer();
273
274      buffer.append(this.label);
275      if (this.feature != null)
276        buffer.append("-" + this.feature);
277      if (this.preposition != null)
278        buffer.append("-" + this.preposition);
279
280      return buffer.toString();
281    }
282  }
283
284}