001/* 002 * Copyright (c) 2012, Regents of the University of Colorado 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without 006 * modification, are permitted provided that the following conditions are met: 007 * 008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 011 * 012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 022 * POSSIBILITY OF SUCH DAMAGE. 023 */ 024package org.cleartk.clearnlp; 025 026import java.util.List; 027import java.util.Map; 028 029import org.apache.uima.UimaContext; 030import org.apache.uima.analysis_engine.AnalysisEngineDescription; 031import org.apache.uima.analysis_engine.AnalysisEngineProcessException; 032import org.apache.uima.jcas.JCas; 033import org.apache.uima.jcas.cas.TOP; 034import org.apache.uima.jcas.tcas.Annotation; 035import org.apache.uima.resource.ResourceInitializationException; 036import org.apache.uima.fit.component.JCasAnnotator_ImplBase; 037import org.apache.uima.fit.descriptor.ConfigurationParameter; 038import org.apache.uima.fit.factory.AnalysisEngineFactory; 039import org.apache.uima.fit.util.JCasUtil; 040 041import com.google.common.annotations.Beta; 042import com.google.common.collect.Lists; 043import com.google.common.collect.Maps; 044import com.clearnlp.component.AbstractComponent; 045import com.clearnlp.dependency.DEPArc; 046import com.clearnlp.dependency.DEPLib; 047import com.clearnlp.dependency.DEPNode; 048import com.clearnlp.dependency.DEPTree; 049import com.clearnlp.dependency.srl.SRLArc; 050import com.clearnlp.nlp.NLPGetter; 051import com.clearnlp.nlp.NLPLib; 052import com.clearnlp.reader.AbstractReader; 053 054/** 055 * <br> 056 * Copyright (c) 2012, Regents of the University of Colorado <br> 057 * All rights reserved. 058 * <p> 059 * This class provides a UIMA/ClearTK wrapper for the ClearNLP semantic role labeler. A typical 060 * pipeline preceding this analysis engine would consist of a tokenizer, sentence segmenter, POS 061 * tagger, lemmatizer (mp analyzer), and dependency parser. 062 * <p> 063 * The ClearNLP labeler is available here: 064 * <p> 065 * http://clearnlp.googlecode.com 066 * <p> 067 * 068 * @author Lee Becker 069 * 070 */ 071@Beta 072public abstract class SemanticRoleLabeler_ImplBase<WINDOW_TYPE extends Annotation, TOKEN_TYPE extends Annotation, DEPENDENCY_NODE_TYPE extends TOP, DEPENDENCY_ROOT_NODE_TYPE extends DEPENDENCY_NODE_TYPE, DEPENDENCY_RELATION_TYPE extends TOP, ARGUMENT_TYPE extends TOP, PREDICATE_TYPE extends TOP> 073 extends JCasAnnotator_ImplBase { 074 075 /* 076 public static final String DEFAULT_PRED_ID_MODEL_FILE_NAME = "ontonotes-en-pred-1.3.0.tgz"; 077 078 public static final String DEFAULT_ROLESET_MODEL_FILE_NAME = "ontonotes-en-role-1.3.0.tgz"; 079 080 public static final String DEFAULT_SRL_MODEL_FILE_NAME = "ontonotes-en-srl-1.3.0.tgz"; 081 */ 082 083 public static final String DEFAULT_PRED_ID_MODEL_PATH = "general-en"; 084 085 public static final String DEFAULT_ROLESET_MODEL_PATH = "general-en"; 086 087 public static final String DEFAULT_SRL_MODEL_PATH = "general-en"; 088 089 public static final String PARAM_SRL_MODEL_PATH = "srlModelPath"; 090 @ConfigurationParameter( 091 name = PARAM_SRL_MODEL_PATH, 092 mandatory = false, 093 description = "This parameter provides the path pointing to the semantic role labeler model. If none is specified it will use the default ontonotes model.", 094 defaultValue=DEFAULT_SRL_MODEL_PATH) 095 private String srlModelPath; 096 097 public static final String PARAM_PRED_ID_MODEL_PATH = "predIdModelPath"; 098 @ConfigurationParameter( 099 name = PARAM_PRED_ID_MODEL_PATH, 100 mandatory = false, 101 description = "This parameter provides the path pointing to the predicate identifier model. If none is specified it will use the default ontonotes model.", 102 defaultValue=DEFAULT_PRED_ID_MODEL_PATH) 103 private String predIdModelPath; 104 105 public static final String PARAM_ROLESET_MODEL_PATH = "rolesetModelPath"; 106 107 @ConfigurationParameter( 108 name = PARAM_ROLESET_MODEL_PATH, 109 mandatory = false, 110 description = "This parameter provides the path pointing to the role set classifier model. If none is specified it will use the default ontonotes model.", 111 defaultValue=DEFAULT_ROLESET_MODEL_PATH) 112 private String rolesetModelPath; 113 114 public static final String PARAM_LANGUAGE_CODE = "languageCode"; 115 116 @ConfigurationParameter( 117 name = PARAM_LANGUAGE_CODE, 118 mandatory = false, 119 description = "Language code for the semantic role labeler (default value=en).", 120 defaultValue = AbstractReader.LANG_EN) 121 private String languageCode; 122 123 public static final String PARAM_WINDOW_CLASS = "windowClass"; 124 125 private static final String WINDOW_TYPE_DESCRIPTION = "specifies the class type of annotations that will be tokenized. " 126 + "By default, the tokenizer will tokenize a document sentence by sentence. If you do not want to precede tokenization with" 127 + "sentence segmentation, then a reasonable value for this parameter is 'org.apache.uima.jcas.tcas.DocumentAnnotation'"; 128 129 @ConfigurationParameter( 130 name = PARAM_WINDOW_CLASS, 131 mandatory = false, 132 description = WINDOW_TYPE_DESCRIPTION, 133 defaultValue = "org.cleartk.token.type.Sentence") 134 private Class<? extends WINDOW_TYPE> windowClass; 135 136 private TokenOps<TOKEN_TYPE> tokenOps; 137 138 private DependencyOps<DEPENDENCY_NODE_TYPE, TOKEN_TYPE, DEPENDENCY_ROOT_NODE_TYPE, WINDOW_TYPE, DEPENDENCY_RELATION_TYPE> dependencyOps; 139 140 private SemanticRoleOps<ARGUMENT_TYPE, TOKEN_TYPE, PREDICATE_TYPE, TOKEN_TYPE> srlOps; 141 142 public SemanticRoleLabeler_ImplBase( 143 TokenOps<TOKEN_TYPE> tokenOps, 144 DependencyOps<DEPENDENCY_NODE_TYPE, TOKEN_TYPE, DEPENDENCY_ROOT_NODE_TYPE, WINDOW_TYPE, DEPENDENCY_RELATION_TYPE> dependencyOps, 145 SemanticRoleOps<ARGUMENT_TYPE, TOKEN_TYPE, PREDICATE_TYPE, TOKEN_TYPE> srlOps) { 146 this.tokenOps = tokenOps; 147 this.dependencyOps = dependencyOps; 148 this.srlOps = srlOps; 149 } 150 151 @Override 152 public void initialize(UimaContext aContext) throws ResourceInitializationException { 153 super.initialize(aContext); 154 155 try { 156 this.predIdentifier = NLPGetter.getComponent( 157 this.predIdModelPath, 158 languageCode, 159 NLPLib.MODE_PRED); 160 161 this.roleSetClassifier = NLPGetter.getComponent( 162 this.rolesetModelPath, 163 languageCode, 164 NLPLib.MODE_ROLE); 165 166 this.srlabeler = NLPGetter.getComponent( 167 this.srlModelPath, 168 languageCode, 169 NLPLib.MODE_SRL); 170 171 } catch (Exception e) { 172 throw new ResourceInitializationException(e); 173 } 174 } 175 176 /** 177 * Convenience method for creating Analysis Engine for ClearNLP's dependency parser using default 178 * English model files 179 */ 180 public static AnalysisEngineDescription getDescription() throws ResourceInitializationException { 181 return AnalysisEngineFactory.createEngineDescription(SemanticRoleLabeler_ImplBase.class); 182 183 } 184 185 @Override 186 public void process(JCas jCas) throws AnalysisEngineProcessException { 187 188 for (WINDOW_TYPE window : JCasUtil.select(jCas, this.windowClass)) { 189 boolean skipSentence = false; 190 List<TOKEN_TYPE> tokens = this.tokenOps.selectTokens(jCas, window); 191 List<String> tokenStrings = JCasUtil.toText(tokens); 192 DEPENDENCY_ROOT_NODE_TYPE dependencyRoot = this.dependencyOps.selectRootNode(jCas, window); 193 List<DEPENDENCY_NODE_TYPE> dependencyNodes = this.dependencyOps.selectNodes(jCas, window); 194 if (dependencyNodes.size() != tokens.size()) { 195 throw new IllegalArgumentException(String.format( 196 "Expected one dependency node per token; found %d tokens and %d dependency nodes", 197 tokens.size(), 198 dependencyNodes.size())); 199 } 200 201 // Build dependency tree from token information 202 DEPTree tree = NLPGetter.toDEPTree(tokenStrings); 203 // DEPTree tree = new DEPTree(); 204 for (int i = 1; i < tree.size(); i++) { 205 TOKEN_TYPE token = tokens.get(i - 1); 206 DEPNode node = tree.get(i); 207 node.pos = this.tokenOps.getPos(jCas, token); 208 node.lemma = this.tokenOps.getLemma(jCas, token); 209 } 210 211 // Build map between CAS dependency node and id for later creation of 212 // ClearParser dependency node/tree 213 Map<DEPENDENCY_NODE_TYPE, Integer> depNodeToID = Maps.newHashMap(); 214 depNodeToID.put(dependencyRoot, 0); 215 int nodeId = 1; 216 for (DEPENDENCY_NODE_TYPE depNode : dependencyNodes) { 217 depNodeToID.put(depNode, nodeId); 218 nodeId++; 219 } 220 221 // Initialize Dependency Relations for ClearNLP input 222 for (DEPENDENCY_NODE_TYPE casDepNode : dependencyNodes) { 223 List<DEPENDENCY_RELATION_TYPE> relations = this.dependencyOps.getHeadRelations( 224 jCas, 225 casDepNode); 226 if (relations.size() == 0) { 227 // In cases where the sentence is unparseable we are left with only a root node 228 // Thus the Semantic Role Labeler should skip this sentence 229 skipSentence = true; 230 } else if (relations.size() != 1) { 231 throw new IllegalArgumentException("Expected 1 head, found " + relations.size()); 232 } else { 233 for (DEPENDENCY_RELATION_TYPE relation : relations) { 234 DEPENDENCY_NODE_TYPE head = this.dependencyOps.getHead(jCas, relation); 235 String label = this.dependencyOps.getLabel(jCas, relation); 236 DEPNode node = tree.get(depNodeToID.get(casDepNode)); 237 DEPNode headNode = tree.get(depNodeToID.get(head)); 238 node.setHead(headNode, label); 239 } 240 } 241 } 242 243 // Run the SRL 244 if (!skipSentence) { 245 this.predIdentifier.process(tree); 246 this.roleSetClassifier.process(tree); 247 this.srlabeler.process(tree); 248 249 // Extract SRL information and create ClearTK CAS types 250 this.extractSRLInfo(jCas, tokens, tree); 251 } 252 } 253 } 254 255 /** 256 * Converts the output from the ClearParser Semantic Role Labeler to the ClearTK Predicate and 257 * SemanticArgument Types. 258 * 259 * @param jCas 260 * @param tokens 261 * - In order list of tokens 262 * @param tree 263 * - DepdendencyTree output by ClearParser SRLPredict 264 */ 265 private void extractSRLInfo(JCas jCas, List<TOKEN_TYPE> tokens, DEPTree tree) { 266 Map<Integer, PREDICATE_TYPE> headIdToPredicate = Maps.newHashMap(); 267 Map<PREDICATE_TYPE, List<ARGUMENT_TYPE>> predicateArguments = Maps.newHashMap(); 268 269 // Start at node 1, since node 0 is considered the head of the sentence 270 for (int i = 1; i < tree.size(); i++) { 271 // Every ClearParser parserNode will contain an srlInfo field. 272 DEPNode parserNode = tree.get(i); 273 TOKEN_TYPE token = tokens.get(i - 1); 274 275 List<SRLArc> semanticHeads = parserNode.getSHeads(); 276 if (semanticHeads.isEmpty()) { 277 continue; 278 } 279 280 // Parse semantic head relations to get SRL triplets 281 for (DEPArc shead : semanticHeads) { 282 int headId = shead.getNode().id; 283 TOKEN_TYPE headToken = tokens.get(headId - 1); 284 PREDICATE_TYPE pred; 285 List<ARGUMENT_TYPE> args; 286 if (!headIdToPredicate.containsKey(headId)) { 287 String rolesetId = shead.getNode().getFeat(DEPLib.FEAT_PB); 288 pred = this.srlOps.createPredicate(jCas, headToken, rolesetId); 289 headIdToPredicate.put(headId, pred); 290 args = Lists.newArrayList(); 291 predicateArguments.put(pred, args); 292 } else { 293 pred = headIdToPredicate.get(headId); 294 args = predicateArguments.get(pred); 295 } 296 args.add(this.srlOps.createArgument(jCas, token, shead.getLabel())); 297 } 298 } 299 300 // Store Arguments in Predicate 301 for (Map.Entry<PREDICATE_TYPE, List<ARGUMENT_TYPE>> entry : predicateArguments.entrySet()) { 302 PREDICATE_TYPE predicate = entry.getKey(); 303 List<ARGUMENT_TYPE> arguments = entry.getValue(); 304 this.srlOps.setPredicateArguments(jCas, predicate, arguments); 305 } 306 307 } 308 309 private AbstractComponent predIdentifier; 310 311 private AbstractComponent roleSetClassifier; 312 313 private AbstractComponent srlabeler; 314}