/**
 * Copyright (c) 2007-2008, Regents of the University of Colorado
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
package org.cleartk.corpus.conll2005;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;

import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.cleartk.srl.type.Argument;
import org.cleartk.srl.type.Predicate;
import org.cleartk.srl.type.SemanticArgument;
import org.cleartk.token.type.Sentence;
import org.cleartk.token.type.Token;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.util.JCasUtil;

/**
 * Writes the predicate/argument annotations found in each {@link Sentence} to a text file in
 * CoNLL-2005 style columns.
 * <p>
 * One line is written per {@link Token}. The first column is the base form of the predicate
 * anchored at that token, or {@code "-"} when there is none. It is followed by one tab-separated
 * column per {@link Predicate} of the sentence, each holding the bracketed argument labels that
 * open and/or close at the token (for example {@code (A0*}, {@code *} or {@code *)}). Consecutive
 * sentences are separated by an empty line.
 * <p>
 * Only these columns are produced; the word, part-of-speech, syntax, named-entity and frameset
 * columns are not written by this class.
 *
 * <br>
 * Copyright (c) 2007-2008, Regents of the University of Colorado <br>
 * All rights reserved.
 */
public class Conll2005Writer extends JCasAnnotator_ImplBase {

  @ConfigurationParameter(
      name = PARAM_OUTPUT_FILE,
      mandatory = true,
      description = "the path where the CoNLL-2005-formatted text should be written")
  private File outputFile;

  public static final String PARAM_OUTPUT_FILE = "outputFile";

  /** Destination of all output; opened in {@link #initialize} and closed at collection end. */
  private PrintWriter output;

  /** True until the first sentence has been seen; used to place blank lines between sentences. */
  private boolean first;

  /**
   * Opens the configured output file for writing.
   *
   * @param context
   *          the UIMA context, passed on to the superclass
   * @throws ResourceInitializationException
   *           if the output file cannot be opened
   */
  @Override
  public void initialize(UimaContext context) throws ResourceInitializationException {
    super.initialize(context);
    try {
      // NOTE(review): PrintWriter(File) encodes with the platform default charset.
      this.output = new PrintWriter(outputFile);
      this.first = true;
    } catch (FileNotFoundException e) {
      throw new ResourceInitializationException(e);
    }
  }

  /**
   * Writes one line per token for every sentence in the given JCas and flushes the output.
   *
   * @param jCas
   *          the document whose sentences, tokens and predicates are written
   */
  @Override
  public void process(JCas jCas) throws AnalysisEngineProcessException {
    for (Sentence sentence : JCasUtil.select(jCas, Sentence.class)) {
      // An empty line goes between sentences, but not before the very first one.
      if (first) {
        first = false;
      } else {
        output.println();
      }

      // One writer, and therefore one output column, per predicate covered by the sentence.
      List<PredicateWriter> predicateWriters = new ArrayList<PredicateWriter>();
      for (Predicate predicate : JCasUtil.selectCovered(jCas, Predicate.class, sentence)) {
        predicateWriters.add(new PredicateWriter(jCas, predicate));
      }

      for (Token token : JCasUtil.selectCovered(jCas, Token.class, sentence)) {
        Conll05Line line = new Conll05Line();
        for (PredicateWriter predicateWriter : predicateWriters) {
          predicateWriter.write(token, line);
        }
        output.println(line.evalString());
      }
    }
    output.flush();
  }

  /** Closes the output file once the whole collection has been processed. */
  @Override
  public void collectionProcessComplete() throws AnalysisEngineProcessException {
    output.close();
    super.collectionProcessComplete();
  }

  /** The columns of a single output line, i.e. of a single token. */
  private static class Conll05Line {
    /** Base form of the predicate at this token, or "-" when the token is not a predicate. */
    String predicateBaseform;

    /** One entry per predicate of the sentence, in the order the predicates were visited. */
    List<String> argumentRoles;

    public Conll05Line() {
      predicateBaseform = "-";
      argumentRoles = new ArrayList<String>();
    }

    /**
     * Renders the line: the predicate base form followed by every argument column, each preceded
     * by a tab character.
     *
     * @return the formatted line, without a line terminator
     */
    public String evalString() {
      StringBuilder text = new StringBuilder();
      text.append(predicateBaseform);

      for (String argumentRole : argumentRoles) {
        text.append("\t");
        text.append(argumentRole);
      }

      return text.toString();
    }

    /**
     * Currently a no-op: the frameset column is not part of the written output, so the value is
     * ignored.
     *
     * @param predicateFrameset
     *          ignored
     */
    public void setPredicateFrameset(Integer predicateFrameset) {
      // intentionally empty, see the method comment
    }

    /**
     * Sets the predicate base form column.
     *
     * @param predicateBaseform
     *          the base form; {@code null} is stored as "-"
     */
    public void setPredicateBaseform(String predicateBaseform) {
      if (predicateBaseform == null) {
        this.predicateBaseform = "-";
      } else {
        this.predicateBaseform = predicateBaseform;
      }
    }

    /** Appends one more argument column to this line. */
    public void addArgumentRole(String argumentRole) {
      this.argumentRoles.add(argumentRole);
    }
  }

  /** Produces the output of a single predicate: its base form and its argument column. */
  private static class PredicateWriter {
    String baseform;

    Integer frameset;

    /** The first token covered by the predicate's annotation; the predicate is reported there. */
    Token token;

    List<ArgumentWriter> argumentWriters;

    /**
     * Collects the anchor token, the base form and the semantic arguments of a predicate.
     *
     * @param jCas
     *          the JCas the predicate belongs to
     * @param predicate
     *          the predicate to write
     * @throws IndexOutOfBoundsException
     *           if the predicate's annotation covers no token
     */
    PredicateWriter(JCas jCas, Predicate predicate) {
      this.token = JCasUtil.selectCovered(jCas, Token.class, predicate.getAnnotation()).get(0);
      this.baseform = predicate.getBaseForm();
      this.frameset = 1;

      Collection<Argument> allArgs = JCasUtil.select(predicate.getArguments(), Argument.class);
      this.argumentWriters = new ArrayList<ArgumentWriter>();
      // Only semantic arguments are written; other Argument subtypes are skipped.
      for (Argument arg : allArgs) {
        if (arg instanceof SemanticArgument) {
          this.argumentWriters.add(new ArgumentWriter(jCas, (SemanticArgument) arg));
        }
      }
    }

    /**
     * Adds this predicate's contribution for one token to the given line: always one argument
     * column, plus the predicate base form when the token is this predicate's anchor token.
     */
    void write(Token tok, Conll05Line line) {
      if (this.token.equals(tok)) {
        line.setPredicateBaseform(this.baseform);
        line.setPredicateFrameset(this.frameset);
      }

      line.addArgumentRole(getArgumentsString(tok));
    }

    /**
     * Builds the argument column for one token: the opening brackets of all arguments starting at
     * the token, a "*", then the closing brackets of all arguments ending at the token.
     */
    String getArgumentsString(Token tok) {
      StringBuilder column = new StringBuilder();

      for (ArgumentWriter argumentWriter : this.argumentWriters) {
        column.append(argumentWriter.getStartString(tok));
      }

      column.append("*");

      for (ArgumentWriter argumentWriter : this.argumentWriters) {
        column.append(argumentWriter.getEndString(tok));
      }

      return column.toString();
    }
  }

  /** Produces the opening and closing brackets of a single semantic argument. */
  private static class ArgumentWriter {
    String label;

    String feature;

    String preposition;

    /** Tokens covered by the argument's annotation; may be empty. */
    List<Token> tokens;

    ArgumentWriter(JCas jCas, SemanticArgument arg) {
      this.label = arg.getLabel();
      this.feature = arg.getFeature();
      this.preposition = arg.getPreposition();
      this.tokens = JCasUtil.selectCovered(jCas, Token.class, arg.getAnnotation());
    }

    /**
     * Returns "(" followed by the full label when the given token is the first token of this
     * argument, and the empty string otherwise. An argument that covers no tokens never opens a
     * bracket.
     */
    String getStartString(Token token) {
      // equals() rather than ==, matching PredicateWriter.write: the comparison must not depend on
      // receiving the very same Java object for the same annotation.
      if (!this.tokens.isEmpty() && this.tokens.get(0).equals(token)) {
        return "(" + getFullLabel();
      }
      return "";
    }

    /**
     * Returns ")" when the given token is the last token of this argument, and the empty string
     * otherwise. An argument that covers no tokens never closes a bracket, which keeps the output
     * balanced with {@link #getStartString}.
     */
    String getEndString(Token token) {
      if (!this.tokens.isEmpty() && this.tokens.get(this.tokens.size() - 1).equals(token)) {
        return ")";
      }
      return "";
    }

    /**
     * Joins label, feature and preposition with "-"; feature and preposition are left out when
     * they are {@code null}.
     */
    String getFullLabel() {
      StringBuilder fullLabel = new StringBuilder();

      fullLabel.append(this.label);
      if (this.feature != null) {
        fullLabel.append("-" + this.feature);
      }
      if (this.preposition != null) {
        fullLabel.append("-" + this.preposition);
      }

      return fullLabel.toString();
    }
  }

}