001/*
002 * Copyright (c) 2012, Regents of the University of Colorado 
003 * All rights reserved.
004 * 
005 * Redistribution and use in source and binary forms, with or without
006 * modification, are permitted provided that the following conditions are met:
007 * 
008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
011 * 
012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
022 * POSSIBILITY OF SUCH DAMAGE. 
023 */
024package org.cleartk.clearnlp;
025
026import java.io.IOException;
027import java.util.ArrayList;
028import java.util.List;
029
030import org.apache.uima.UimaContext;
031import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
032import org.apache.uima.jcas.JCas;
033import org.apache.uima.jcas.cas.TOP;
034import org.apache.uima.jcas.tcas.Annotation;
035import org.apache.uima.resource.ResourceInitializationException;
036import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
037import org.apache.uima.fit.descriptor.ConfigurationParameter;
038import org.apache.uima.fit.util.JCasUtil;
039
040import com.google.common.annotations.Beta;
041import com.google.common.collect.HashMultimap;
042import com.google.common.collect.Lists;
043import com.google.common.collect.Multimap;
044import com.clearnlp.component.AbstractComponent;
045import com.clearnlp.dependency.DEPFeat;
046import com.clearnlp.dependency.DEPNode;
047import com.clearnlp.dependency.DEPTree;
048import com.clearnlp.nlp.NLPGetter;
049import com.clearnlp.nlp.NLPLib;
050import com.clearnlp.reader.AbstractReader;
051
052/**
053 * <br>
054 * Copyright (c) 2012, Regents of the University of Colorado <br>
055 * All rights reserved.
056 * <p>
057 * This class provides the base implementation class for the UIMA/ClearTK wrapper for the ClearNLP
058 * dependency parser. Subclasses should override methods for creating and setting properties on
059 * dependency annotations
060 * 
061 * <p>
062 * This parser is available here:
063 * <p>
064 * http://clearnlp.googlecode.com
065 * <p>
066 * 
067 * @author Lee Becker
068 * 
069 */
070@Beta
071public abstract class DependencyParser_ImplBase<WINDOW_TYPE extends Annotation, TOKEN_TYPE extends Annotation, DEPENDENCY_NODE_TYPE extends TOP, DEPENDENCY_ROOT_NODE_TYPE extends DEPENDENCY_NODE_TYPE, DEPENDENCY_RELATION_TYPE extends TOP>
072    extends JCasAnnotator_ImplBase {
073
074  public static final String DEFAULT_MODEL_PATH = "general-en";
075
076  public static final String PARAM_PARSER_MODEL_PATH = "parserModelPath";
077
078  @ConfigurationParameter(
079      name = PARAM_PARSER_MODEL_PATH,
080      mandatory = false,
081      description = "This parameter provides the file name of the dependency parser model required by the factory method provided by ClearParserUtil.",
082      defaultValue=DEFAULT_MODEL_PATH)
083  private String parserModelPath;
084
085
086  public static final String PARAM_LANGUAGE_CODE = "languageCode";
087
088  @ConfigurationParameter(
089      name = PARAM_LANGUAGE_CODE,
090      mandatory = false,
091      description = "Language code for the dependency parser (default value=en).",
092      defaultValue = AbstractReader.LANG_EN)
093  private String languageCode;
094
095  public static final String PARAM_WINDOW_CLASS = "windowClass";
096
097  private static final String WINDOW_TYPE_DESCRIPTION = "specifies the class type of annotations that will be tokenized. "
098      + "By default, the tokenizer will tokenize a document sentence by sentence.  If you do not want to precede tokenization with"
099      + "sentence segmentation, then a reasonable value for this parameter is 'org.apache.uima.jcas.tcas.DocumentAnnotation'";
100
101  @ConfigurationParameter(
102      name = PARAM_WINDOW_CLASS,
103      mandatory = false,
104      description = WINDOW_TYPE_DESCRIPTION,
105      defaultValue = "org.cleartk.token.type.Sentence")
106  private Class<WINDOW_TYPE> windowClass;
107
108  private TokenOps<TOKEN_TYPE> tokenOps;
109
110  private DependencyOps<DEPENDENCY_NODE_TYPE, TOKEN_TYPE, DEPENDENCY_ROOT_NODE_TYPE, WINDOW_TYPE, DEPENDENCY_RELATION_TYPE> dependencyOps;
111
112  public DependencyParser_ImplBase(
113      TokenOps<TOKEN_TYPE> tokenOps,
114      DependencyOps<DEPENDENCY_NODE_TYPE, TOKEN_TYPE, DEPENDENCY_ROOT_NODE_TYPE, WINDOW_TYPE, DEPENDENCY_RELATION_TYPE> dependencyOps) {
115    this.tokenOps = tokenOps;
116    this.dependencyOps = dependencyOps;
117  }
118
119  @Override
120  public void initialize(UimaContext aContext) throws ResourceInitializationException {
121    super.initialize(aContext);
122    try {
123      this.parser = NLPGetter.getComponent(
124          this.parserModelPath,
125          this.languageCode,
126          NLPLib.MODE_DEP);
127    } catch (IOException e) {
128      throw new ResourceInitializationException(e);
129    }
130    
131  }
132
133  @Override
134  public void process(JCas jCas) throws AnalysisEngineProcessException {
135
136    for (WINDOW_TYPE window : JCasUtil.select(jCas, this.windowClass)) {
137      List<TOKEN_TYPE> tokens = this.tokenOps.selectTokens(jCas, window);
138
139      // Extract data from CAS and stuff it into ClearNLP data structures
140      DEPTree tree = new DEPTree();
141      for (int i = 0; i < tokens.size(); i++) {
142        TOKEN_TYPE token = tokens.get(i);
143        String lemma = this.tokenOps.getLemma(jCas, token);
144        String pos = this.tokenOps.getPos(jCas, token);
145        DEPNode node = new DEPNode(i + 1, token.getCoveredText(), lemma, pos, new DEPFeat());
146        tree.add(node);
147      }
148
149      // Run the parser
150      this.parser.process(tree);
151
152      // convert ClearNLP output back into CAS type system annotation
153      this.addTreeToCas(jCas, tree, window, tokens);
154    }
155  }
156
157  /**
158   * Takes parsed tree from ClearNLP and converts it into dependency type system.
159   * 
160   * @param jCas
161   * @param tree
162   * @param window
163   * @param tokens
164   */
165  private void addTreeToCas(JCas jCas, DEPTree tree, WINDOW_TYPE window, List<TOKEN_TYPE> tokens) {
166
167    ArrayList<DEPENDENCY_NODE_TYPE> nodes = new ArrayList<DEPENDENCY_NODE_TYPE>(tree.size());
168    DEPENDENCY_ROOT_NODE_TYPE rootNode = this.dependencyOps.createRootNode(jCas, window);
169    nodes.add(rootNode);
170
171    for (int i = 0; i < tokens.size(); i++) {
172      TOKEN_TYPE token = tokens.get(i);
173      nodes.add(this.dependencyOps.createNode(jCas, token));
174    }
175
176    Multimap<DEPENDENCY_NODE_TYPE, DEPENDENCY_RELATION_TYPE> headRelations = HashMultimap.create();
177    Multimap<DEPENDENCY_NODE_TYPE, DEPENDENCY_RELATION_TYPE> childRelations = HashMultimap.create();
178    // extract relation arcs from ClearNLP parse tree
179    for (int i = 0; i < tree.size(); i++) {
180      DEPNode parserNode = tree.get(i);
181      if (parserNode.hasHead()) {
182        int headIndex = parserNode.getHead().id;
183        DEPENDENCY_NODE_TYPE node = nodes.get(i);
184        DEPENDENCY_NODE_TYPE headNode = nodes.get(headIndex);
185        DEPENDENCY_RELATION_TYPE rel = this.dependencyOps.createRelation(
186            jCas,
187            headNode,
188            node,
189            parserNode.getLabel());
190
191        headRelations.put(node, rel);
192        childRelations.put(headNode, rel);
193      }
194    }
195
196    // finalize nodes: add links between nodes and relations
197    for (DEPENDENCY_NODE_TYPE node : nodes) {
198      this.dependencyOps.setHeadRelations(jCas, node, Lists.newArrayList(headRelations.get(node)));
199      this.dependencyOps.setChildRelations(jCas, node, Lists.newArrayList(childRelations.get(node)));
200      node.addToIndexes();
201    }
202  }
203
204  private AbstractComponent parser;
205}