001/* 002 * Copyright (c) 2012, Regents of the University of Colorado 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without 006 * modification, are permitted provided that the following conditions are met: 007 * 008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 011 * 012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 022 * POSSIBILITY OF SUCH DAMAGE. 023 */ 024package org.cleartk.clearnlp; 025 026import java.io.IOException; 027import java.util.ArrayList; 028import java.util.List; 029 030import org.apache.uima.UimaContext; 031import org.apache.uima.analysis_engine.AnalysisEngineProcessException; 032import org.apache.uima.jcas.JCas; 033import org.apache.uima.jcas.cas.TOP; 034import org.apache.uima.jcas.tcas.Annotation; 035import org.apache.uima.resource.ResourceInitializationException; 036import org.apache.uima.fit.component.JCasAnnotator_ImplBase; 037import org.apache.uima.fit.descriptor.ConfigurationParameter; 038import org.apache.uima.fit.util.JCasUtil; 039 040import com.google.common.annotations.Beta; 041import com.google.common.collect.HashMultimap; 042import com.google.common.collect.Lists; 043import com.google.common.collect.Multimap; 044import com.clearnlp.component.AbstractComponent; 045import com.clearnlp.dependency.DEPFeat; 046import com.clearnlp.dependency.DEPNode; 047import com.clearnlp.dependency.DEPTree; 048import com.clearnlp.nlp.NLPGetter; 049import com.clearnlp.nlp.NLPLib; 050import com.clearnlp.reader.AbstractReader; 051 052/** 053 * <br> 054 * Copyright (c) 2012, Regents of the University of Colorado <br> 055 * All rights reserved. 056 * <p> 057 * This class provides the base implementation class for the UIMA/ClearTK wrapper for the ClearNLP 058 * dependency parser. Subclasses should override methods for creating and setting properties on 059 * dependency annotations 060 * 061 * <p> 062 * This parser is available here: 063 * <p> 064 * http://clearnlp.googlecode.com 065 * <p> 066 * 067 * @author Lee Becker 068 * 069 */ 070@Beta 071public abstract class DependencyParser_ImplBase<WINDOW_TYPE extends Annotation, TOKEN_TYPE extends Annotation, DEPENDENCY_NODE_TYPE extends TOP, DEPENDENCY_ROOT_NODE_TYPE extends DEPENDENCY_NODE_TYPE, DEPENDENCY_RELATION_TYPE extends TOP> 072 extends JCasAnnotator_ImplBase { 073 074 public static final String DEFAULT_MODEL_PATH = "general-en"; 075 076 public static final String PARAM_PARSER_MODEL_PATH = "parserModelPath"; 077 078 @ConfigurationParameter( 079 name = PARAM_PARSER_MODEL_PATH, 080 mandatory = false, 081 description = "This parameter provides the file name of the dependency parser model required by the factory method provided by ClearParserUtil.", 082 defaultValue=DEFAULT_MODEL_PATH) 083 private String parserModelPath; 084 085 086 public static final String PARAM_LANGUAGE_CODE = "languageCode"; 087 088 @ConfigurationParameter( 089 name = PARAM_LANGUAGE_CODE, 090 mandatory = false, 091 description = "Language code for the dependency parser (default value=en).", 092 defaultValue = AbstractReader.LANG_EN) 093 private String languageCode; 094 095 public static final String PARAM_WINDOW_CLASS = "windowClass"; 096 097 private static final String WINDOW_TYPE_DESCRIPTION = "specifies the class type of annotations that will be tokenized. " 098 + "By default, the tokenizer will tokenize a document sentence by sentence. If you do not want to precede tokenization with" 099 + "sentence segmentation, then a reasonable value for this parameter is 'org.apache.uima.jcas.tcas.DocumentAnnotation'"; 100 101 @ConfigurationParameter( 102 name = PARAM_WINDOW_CLASS, 103 mandatory = false, 104 description = WINDOW_TYPE_DESCRIPTION, 105 defaultValue = "org.cleartk.token.type.Sentence") 106 private Class<WINDOW_TYPE> windowClass; 107 108 private TokenOps<TOKEN_TYPE> tokenOps; 109 110 private DependencyOps<DEPENDENCY_NODE_TYPE, TOKEN_TYPE, DEPENDENCY_ROOT_NODE_TYPE, WINDOW_TYPE, DEPENDENCY_RELATION_TYPE> dependencyOps; 111 112 public DependencyParser_ImplBase( 113 TokenOps<TOKEN_TYPE> tokenOps, 114 DependencyOps<DEPENDENCY_NODE_TYPE, TOKEN_TYPE, DEPENDENCY_ROOT_NODE_TYPE, WINDOW_TYPE, DEPENDENCY_RELATION_TYPE> dependencyOps) { 115 this.tokenOps = tokenOps; 116 this.dependencyOps = dependencyOps; 117 } 118 119 @Override 120 public void initialize(UimaContext aContext) throws ResourceInitializationException { 121 super.initialize(aContext); 122 try { 123 this.parser = NLPGetter.getComponent( 124 this.parserModelPath, 125 this.languageCode, 126 NLPLib.MODE_DEP); 127 } catch (IOException e) { 128 throw new ResourceInitializationException(e); 129 } 130 131 } 132 133 @Override 134 public void process(JCas jCas) throws AnalysisEngineProcessException { 135 136 for (WINDOW_TYPE window : JCasUtil.select(jCas, this.windowClass)) { 137 List<TOKEN_TYPE> tokens = this.tokenOps.selectTokens(jCas, window); 138 139 // Extract data from CAS and stuff it into ClearNLP data structures 140 DEPTree tree = new DEPTree(); 141 for (int i = 0; i < tokens.size(); i++) { 142 TOKEN_TYPE token = tokens.get(i); 143 String lemma = this.tokenOps.getLemma(jCas, token); 144 String pos = this.tokenOps.getPos(jCas, token); 145 DEPNode node = new DEPNode(i + 1, token.getCoveredText(), lemma, pos, new DEPFeat()); 146 tree.add(node); 147 } 148 149 // Run the parser 150 this.parser.process(tree); 151 152 // convert ClearNLP output back into CAS type system annotation 153 this.addTreeToCas(jCas, tree, window, tokens); 154 } 155 } 156 157 /** 158 * Takes parsed tree from ClearNLP and converts it into dependency type system. 159 * 160 * @param jCas 161 * @param tree 162 * @param window 163 * @param tokens 164 */ 165 private void addTreeToCas(JCas jCas, DEPTree tree, WINDOW_TYPE window, List<TOKEN_TYPE> tokens) { 166 167 ArrayList<DEPENDENCY_NODE_TYPE> nodes = new ArrayList<DEPENDENCY_NODE_TYPE>(tree.size()); 168 DEPENDENCY_ROOT_NODE_TYPE rootNode = this.dependencyOps.createRootNode(jCas, window); 169 nodes.add(rootNode); 170 171 for (int i = 0; i < tokens.size(); i++) { 172 TOKEN_TYPE token = tokens.get(i); 173 nodes.add(this.dependencyOps.createNode(jCas, token)); 174 } 175 176 Multimap<DEPENDENCY_NODE_TYPE, DEPENDENCY_RELATION_TYPE> headRelations = HashMultimap.create(); 177 Multimap<DEPENDENCY_NODE_TYPE, DEPENDENCY_RELATION_TYPE> childRelations = HashMultimap.create(); 178 // extract relation arcs from ClearNLP parse tree 179 for (int i = 0; i < tree.size(); i++) { 180 DEPNode parserNode = tree.get(i); 181 if (parserNode.hasHead()) { 182 int headIndex = parserNode.getHead().id; 183 DEPENDENCY_NODE_TYPE node = nodes.get(i); 184 DEPENDENCY_NODE_TYPE headNode = nodes.get(headIndex); 185 DEPENDENCY_RELATION_TYPE rel = this.dependencyOps.createRelation( 186 jCas, 187 headNode, 188 node, 189 parserNode.getLabel()); 190 191 headRelations.put(node, rel); 192 childRelations.put(headNode, rel); 193 } 194 } 195 196 // finalize nodes: add links between nodes and relations 197 for (DEPENDENCY_NODE_TYPE node : nodes) { 198 this.dependencyOps.setHeadRelations(jCas, node, Lists.newArrayList(headRelations.get(node))); 199 this.dependencyOps.setChildRelations(jCas, node, Lists.newArrayList(childRelations.get(node))); 200 node.addToIndexes(); 201 } 202 } 203 204 private AbstractComponent parser; 205}