001/*
002 * Copyright (c) 2012, Regents of the University of Colorado 
003 * All rights reserved.
004 * 
005 * Redistribution and use in source and binary forms, with or without
006 * modification, are permitted provided that the following conditions are met:
007 * 
008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
011 * 
012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
022 * POSSIBILITY OF SUCH DAMAGE. 
023 */
024package org.cleartk.clearnlp;
025
026import java.util.List;
027import java.util.Map;
028
029import org.apache.uima.UimaContext;
030import org.apache.uima.analysis_engine.AnalysisEngineDescription;
031import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
032import org.apache.uima.jcas.JCas;
033import org.apache.uima.jcas.cas.TOP;
034import org.apache.uima.jcas.tcas.Annotation;
035import org.apache.uima.resource.ResourceInitializationException;
036import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
037import org.apache.uima.fit.descriptor.ConfigurationParameter;
038import org.apache.uima.fit.factory.AnalysisEngineFactory;
039import org.apache.uima.fit.util.JCasUtil;
040
041import com.google.common.annotations.Beta;
042import com.google.common.collect.Lists;
043import com.google.common.collect.Maps;
044import com.clearnlp.component.AbstractComponent;
045import com.clearnlp.dependency.DEPArc;
046import com.clearnlp.dependency.DEPLib;
047import com.clearnlp.dependency.DEPNode;
048import com.clearnlp.dependency.DEPTree;
049import com.clearnlp.dependency.srl.SRLArc;
050import com.clearnlp.nlp.NLPGetter;
051import com.clearnlp.nlp.NLPLib;
052import com.clearnlp.reader.AbstractReader;
053
054/**
055 * <br>
056 * Copyright (c) 2012, Regents of the University of Colorado <br>
057 * All rights reserved.
058 * <p>
059 * This class provides a UIMA/ClearTK wrapper for the ClearNLP semantic role labeler. A typical
060 * pipeline preceding this analysis engine would consist of a tokenizer, sentence segmenter, POS
061 * tagger, lemmatizer (mp analyzer), and dependency parser.
062 * <p>
063 * The ClearNLP labeler is available here:
064 * <p>
065 * http://clearnlp.googlecode.com
066 * <p>
067 * 
068 * @author Lee Becker
069 * 
070 */
071@Beta
072public abstract class SemanticRoleLabeler_ImplBase<WINDOW_TYPE extends Annotation, TOKEN_TYPE extends Annotation, DEPENDENCY_NODE_TYPE extends TOP, DEPENDENCY_ROOT_NODE_TYPE extends DEPENDENCY_NODE_TYPE, DEPENDENCY_RELATION_TYPE extends TOP, ARGUMENT_TYPE extends TOP, PREDICATE_TYPE extends TOP>
073    extends JCasAnnotator_ImplBase {
074
075  /*
076  public static final String DEFAULT_PRED_ID_MODEL_FILE_NAME = "ontonotes-en-pred-1.3.0.tgz";
077
078  public static final String DEFAULT_ROLESET_MODEL_FILE_NAME = "ontonotes-en-role-1.3.0.tgz";
079
080  public static final String DEFAULT_SRL_MODEL_FILE_NAME = "ontonotes-en-srl-1.3.0.tgz";
081  */
082
083  public static final String DEFAULT_PRED_ID_MODEL_PATH = "general-en";
084
085  public static final String DEFAULT_ROLESET_MODEL_PATH = "general-en";
086
087  public static final String DEFAULT_SRL_MODEL_PATH = "general-en";
088
089  public static final String PARAM_SRL_MODEL_PATH = "srlModelPath";
090  @ConfigurationParameter(
091      name = PARAM_SRL_MODEL_PATH,
092      mandatory = false,
093      description = "This parameter provides the path pointing to the semantic role labeler model.  If none is specified it will use the default ontonotes model.",
094      defaultValue=DEFAULT_SRL_MODEL_PATH)
095  private String srlModelPath;
096
097  public static final String PARAM_PRED_ID_MODEL_PATH = "predIdModelPath";
098  @ConfigurationParameter(
099      name = PARAM_PRED_ID_MODEL_PATH,
100      mandatory = false,
101      description = "This parameter provides the path pointing to the predicate identifier model.  If none is specified it will use the default ontonotes model.",
102      defaultValue=DEFAULT_PRED_ID_MODEL_PATH)
103  private String predIdModelPath;
104
105  public static final String PARAM_ROLESET_MODEL_PATH = "rolesetModelPath";
106
107  @ConfigurationParameter(
108      name = PARAM_ROLESET_MODEL_PATH,
109      mandatory = false,
110      description = "This parameter provides the path pointing to the role set classifier model.  If none is specified it will use the default ontonotes model.",
111      defaultValue=DEFAULT_ROLESET_MODEL_PATH)
112  private String rolesetModelPath;
113
114  public static final String PARAM_LANGUAGE_CODE = "languageCode";
115
116  @ConfigurationParameter(
117      name = PARAM_LANGUAGE_CODE,
118      mandatory = false,
119      description = "Language code for the semantic role labeler (default value=en).",
120      defaultValue = AbstractReader.LANG_EN)
121  private String languageCode;
122
123  public static final String PARAM_WINDOW_CLASS = "windowClass";
124
125  private static final String WINDOW_TYPE_DESCRIPTION = "specifies the class type of annotations that will be tokenized. "
126      + "By default, the tokenizer will tokenize a document sentence by sentence.  If you do not want to precede tokenization with"
127      + "sentence segmentation, then a reasonable value for this parameter is 'org.apache.uima.jcas.tcas.DocumentAnnotation'";
128
129  @ConfigurationParameter(
130      name = PARAM_WINDOW_CLASS,
131      mandatory = false,
132      description = WINDOW_TYPE_DESCRIPTION,
133      defaultValue = "org.cleartk.token.type.Sentence")
134  private Class<? extends WINDOW_TYPE> windowClass;
135
136  private TokenOps<TOKEN_TYPE> tokenOps;
137
138  private DependencyOps<DEPENDENCY_NODE_TYPE, TOKEN_TYPE, DEPENDENCY_ROOT_NODE_TYPE, WINDOW_TYPE, DEPENDENCY_RELATION_TYPE> dependencyOps;
139
140  private SemanticRoleOps<ARGUMENT_TYPE, TOKEN_TYPE, PREDICATE_TYPE, TOKEN_TYPE> srlOps;
141
142  public SemanticRoleLabeler_ImplBase(
143      TokenOps<TOKEN_TYPE> tokenOps,
144      DependencyOps<DEPENDENCY_NODE_TYPE, TOKEN_TYPE, DEPENDENCY_ROOT_NODE_TYPE, WINDOW_TYPE, DEPENDENCY_RELATION_TYPE> dependencyOps,
145      SemanticRoleOps<ARGUMENT_TYPE, TOKEN_TYPE, PREDICATE_TYPE, TOKEN_TYPE> srlOps) {
146    this.tokenOps = tokenOps;
147    this.dependencyOps = dependencyOps;
148    this.srlOps = srlOps;
149  }
150
151  @Override
152  public void initialize(UimaContext aContext) throws ResourceInitializationException {
153    super.initialize(aContext);
154
155    try {
156      this.predIdentifier = NLPGetter.getComponent(
157          this.predIdModelPath,
158          languageCode,
159          NLPLib.MODE_PRED);
160
161      this.roleSetClassifier = NLPGetter.getComponent(
162          this.rolesetModelPath,
163          languageCode,
164          NLPLib.MODE_ROLE);
165
166      this.srlabeler = NLPGetter.getComponent(
167          this.srlModelPath,
168          languageCode,
169          NLPLib.MODE_SRL);
170
171    } catch (Exception e) {
172      throw new ResourceInitializationException(e);
173    }
174  }
175
176  /**
177   * Convenience method for creating Analysis Engine for ClearNLP's dependency parser using default
178   * English model files
179   */
180  public static AnalysisEngineDescription getDescription() throws ResourceInitializationException {
181    return AnalysisEngineFactory.createEngineDescription(SemanticRoleLabeler_ImplBase.class);
182
183  }
184
185  @Override
186  public void process(JCas jCas) throws AnalysisEngineProcessException {
187
188    for (WINDOW_TYPE window : JCasUtil.select(jCas, this.windowClass)) {
189      boolean skipSentence = false;
190      List<TOKEN_TYPE> tokens = this.tokenOps.selectTokens(jCas, window);
191      List<String> tokenStrings = JCasUtil.toText(tokens);
192      DEPENDENCY_ROOT_NODE_TYPE dependencyRoot = this.dependencyOps.selectRootNode(jCas, window);
193      List<DEPENDENCY_NODE_TYPE> dependencyNodes = this.dependencyOps.selectNodes(jCas, window);
194      if (dependencyNodes.size() != tokens.size()) {
195        throw new IllegalArgumentException(String.format(
196            "Expected one dependency node per token; found %d tokens and %d dependency nodes",
197            tokens.size(),
198            dependencyNodes.size()));
199      }
200
201      // Build dependency tree from token information
202      DEPTree tree = NLPGetter.toDEPTree(tokenStrings);
203      // DEPTree tree = new DEPTree();
204      for (int i = 1; i < tree.size(); i++) {
205        TOKEN_TYPE token = tokens.get(i - 1);
206        DEPNode node = tree.get(i);
207        node.pos = this.tokenOps.getPos(jCas, token);
208        node.lemma = this.tokenOps.getLemma(jCas, token);
209      }
210
211      // Build map between CAS dependency node and id for later creation of
212      // ClearParser dependency node/tree
213      Map<DEPENDENCY_NODE_TYPE, Integer> depNodeToID = Maps.newHashMap();
214      depNodeToID.put(dependencyRoot, 0);
215      int nodeId = 1;
216      for (DEPENDENCY_NODE_TYPE depNode : dependencyNodes) {
217        depNodeToID.put(depNode, nodeId);
218        nodeId++;
219      }
220
221      // Initialize Dependency Relations for ClearNLP input
222      for (DEPENDENCY_NODE_TYPE casDepNode : dependencyNodes) {
223        List<DEPENDENCY_RELATION_TYPE> relations = this.dependencyOps.getHeadRelations(
224            jCas,
225            casDepNode);
226        if (relations.size() == 0) {
227          // In cases where the sentence is unparseable we are left with only a root node
228          // Thus the Semantic Role Labeler should skip this sentence
229          skipSentence = true;
230        } else if (relations.size() != 1) {
231          throw new IllegalArgumentException("Expected 1 head, found " + relations.size());
232        } else {
233          for (DEPENDENCY_RELATION_TYPE relation : relations) {
234            DEPENDENCY_NODE_TYPE head = this.dependencyOps.getHead(jCas, relation);
235            String label = this.dependencyOps.getLabel(jCas, relation);
236            DEPNode node = tree.get(depNodeToID.get(casDepNode));
237            DEPNode headNode = tree.get(depNodeToID.get(head));
238            node.setHead(headNode, label);
239          }
240        }
241      }
242
243      // Run the SRL
244      if (!skipSentence) {
245        this.predIdentifier.process(tree);
246        this.roleSetClassifier.process(tree);
247        this.srlabeler.process(tree);
248
249        // Extract SRL information and create ClearTK CAS types
250        this.extractSRLInfo(jCas, tokens, tree);
251      }
252    }
253  }
254
255  /**
256   * Converts the output from the ClearParser Semantic Role Labeler to the ClearTK Predicate and
257   * SemanticArgument Types.
258   * 
259   * @param jCas
260   * @param tokens
261   *          - In order list of tokens
262   * @param tree
263   *          - DepdendencyTree output by ClearParser SRLPredict
264   */
265  private void extractSRLInfo(JCas jCas, List<TOKEN_TYPE> tokens, DEPTree tree) {
266    Map<Integer, PREDICATE_TYPE> headIdToPredicate = Maps.newHashMap();
267    Map<PREDICATE_TYPE, List<ARGUMENT_TYPE>> predicateArguments = Maps.newHashMap();
268
269    // Start at node 1, since node 0 is considered the head of the sentence
270    for (int i = 1; i < tree.size(); i++) {
271      // Every ClearParser parserNode will contain an srlInfo field.
272      DEPNode parserNode = tree.get(i);
273      TOKEN_TYPE token = tokens.get(i - 1);
274
275      List<SRLArc> semanticHeads = parserNode.getSHeads();
276      if (semanticHeads.isEmpty()) {
277        continue;
278      }
279
280      // Parse semantic head relations to get SRL triplets
281      for (DEPArc shead : semanticHeads) {
282        int headId = shead.getNode().id;
283        TOKEN_TYPE headToken = tokens.get(headId - 1);
284        PREDICATE_TYPE pred;
285        List<ARGUMENT_TYPE> args;
286        if (!headIdToPredicate.containsKey(headId)) {
287          String rolesetId = shead.getNode().getFeat(DEPLib.FEAT_PB);
288          pred = this.srlOps.createPredicate(jCas, headToken, rolesetId);
289          headIdToPredicate.put(headId, pred);
290          args = Lists.newArrayList();
291          predicateArguments.put(pred, args);
292        } else {
293          pred = headIdToPredicate.get(headId);
294          args = predicateArguments.get(pred);
295        }
296        args.add(this.srlOps.createArgument(jCas, token, shead.getLabel()));
297      }
298    }
299
300    // Store Arguments in Predicate
301    for (Map.Entry<PREDICATE_TYPE, List<ARGUMENT_TYPE>> entry : predicateArguments.entrySet()) {
302      PREDICATE_TYPE predicate = entry.getKey();
303      List<ARGUMENT_TYPE> arguments = entry.getValue();
304      this.srlOps.setPredicateArguments(jCas, predicate, arguments);
305    }
306
307  }
308
309  private AbstractComponent predIdentifier;
310
311  private AbstractComponent roleSetClassifier;
312
313  private AbstractComponent srlabeler;
314}