001/* 002 * Copyright (c) 2011, Regents of the University of Colorado 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without 006 * modification, are permitted provided that the following conditions are met: 007 * 008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 011 * 012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 022 * POSSIBILITY OF SUCH DAMAGE. 023 */ 024package org.cleartk.syntax.constituent.type; 025 026import java.io.PrintStream; 027import java.util.ArrayList; 028import java.util.Arrays; 029import java.util.Collection; 030import java.util.List; 031 032import org.apache.uima.jcas.JCas; 033import org.apache.uima.jcas.cas.FSArray; 034import org.apache.uima.jcas.tcas.Annotation; 035import org.apache.uima.fit.util.JCasUtil; 036 037/** 038 * <br> 039 * Copyright (c) 2011, Regents of the University of Colorado <br> 040 * All rights reserved. 041 * 042 * @author Steven Bethard 043 */ 044public class TreebankNodeUtil { 045 046 /** 047 * Selects a single TreebankNode leaf that has the same span as the given annotation. 048 * 049 * @param jCas 050 * The JCas containing the TreebankNodes. 051 * @param annotation 052 * The Annotation whose span should match a TreebankNode leaf. 053 * @return The single TreebankNode leaf that matches the annotation, or null if no such annotation 054 * exists. 055 */ 056 public static TreebankNode selectMatchingLeaf(JCas jCas, Annotation annotation) { 057 TreebankNode leaf = null; 058 for (TreebankNode node : JCasUtil.selectCovered(jCas, TreebankNode.class, annotation)) { 059 if (node.getLeaf() && node.getBegin() == annotation.getBegin() 060 && node.getEnd() == annotation.getEnd()) { 061 if (leaf == null) { 062 leaf = node; 063 } else { 064 throw new IllegalArgumentException(String.format( 065 "expected one leaf matching annotation %s, found %s", 066 annotation, 067 Arrays.asList(leaf, node))); 068 } 069 } 070 } 071 return leaf; 072 } 073 074 /** 075 * Selects the highest TreebankNode in the parse tree that has the same span as the given 076 * annotation. 077 * 078 * @param jCas 079 * The JCas containing the TreebankNodes. 080 * @param annotation 081 * The Annotation whose span should be matched. 082 * @return The highest TreebankNode matching the given span, or null if no such annotation exists. 083 */ 084 public static TreebankNode selectHighestMatchingTreebankNode(JCas jCas, Annotation annotation) { 085 TreebankNode highestNode = null; 086 int smallestDepth = Integer.MAX_VALUE; 087 for (TreebankNode node : JCasUtil.selectCovered(jCas, TreebankNode.class, annotation)) { 088 if (node.getBegin() == annotation.getBegin() && node.getEnd() == annotation.getEnd()) { 089 int depth = getDepth(node); 090 if (depth < smallestDepth) { 091 highestNode = node; 092 smallestDepth = depth; 093 } 094 } 095 } 096 return highestNode; 097 } 098 099 /** 100 * Selects the highest TreebankNode in the parse tree that is at least partially covered by the 101 * given annotation. 102 * 103 * @param jCas 104 * The JCas containing the TreebankNodes. 105 * @param annotation 106 * The Annotation whose span should be matched. 107 * @return The highest TreebankNode at least partially covered by the given span, or null if no 108 * such annotation exists. 109 */ 110 public static TreebankNode selectHighestCoveredTreebankNode(JCas jCas, Annotation annotation) { 111 TreebankNode highestNode = null; 112 int smallestDepth = Integer.MAX_VALUE; 113 for (TreebankNode node : JCasUtil.selectCovered(jCas, TreebankNode.class, annotation)) { 114 if (annotation.getBegin() <= node.getBegin() && node.getEnd() <= annotation.getEnd()) { 115 int depth = getDepth(node); 116 if (depth < smallestDepth) { 117 highestNode = node; 118 smallestDepth = depth; 119 } 120 } 121 } 122 return highestNode; 123 } 124 125 /** 126 * Calculates the depth of the TreebankNode. The root node has depth 0, children of the root node 127 * have depth 1, etc. 128 * 129 * @param node 130 * The TreebankNode whose depth is to be calculated. 131 * @return The depth of the TreebankNode. 132 */ 133 public static int getDepth(TreebankNode node) { 134 int depth = -1; 135 while (node != null) { 136 depth += 1; 137 node = node.getParent(); 138 } 139 return depth; 140 } 141 142 /** 143 * Find the path from a TreebankNode to the root of the tree it belongs to. 144 * 145 * @param startNode 146 * The start node of the path 147 * 148 * @return A list of TreebankNodes that make up the path from <b>startNode</b> to the root of the 149 * tree 150 */ 151 public static List<TreebankNode> getPathToRoot(TreebankNode startNode) { 152 List<TreebankNode> nlist = new ArrayList<TreebankNode>(20); 153 TreebankNode cursorNode = startNode; 154 155 while (cursorNode != null) { 156 nlist.add(cursorNode); 157 cursorNode = cursorNode.getParent(); 158 } 159 160 return nlist; 161 } 162 163 /** 164 * Representation of a path from one TreebankNode to another, via a common ancestor in the tree. 165 */ 166 public static class TreebankNodePath { 167 private List<TreebankNode> sourceToAncestor; 168 169 private TreebankNode commonAncestor; 170 171 private List<TreebankNode> targetToAncestor; 172 173 public TreebankNodePath( 174 TreebankNode commonAncestor, 175 List<TreebankNode> sourceToAncestor, 176 List<TreebankNode> targetToAncestor) { 177 this.commonAncestor = commonAncestor; 178 this.sourceToAncestor = sourceToAncestor; 179 this.targetToAncestor = targetToAncestor; 180 } 181 182 public TreebankNode getCommonAncestor() { 183 return this.commonAncestor; 184 } 185 186 public List<TreebankNode> getSourceToAncestorPath() { 187 return this.sourceToAncestor; 188 } 189 190 public List<TreebankNode> getTargetToAncestorPath() { 191 return this.targetToAncestor; 192 } 193 } 194 195 /** 196 * Get the path from the source TreebankNode to the target TreebankNode via the least common 197 * ancestor. 198 * 199 * @param source 200 * The TreebankNode where the path should start. 201 * @param target 202 * The TreebankNode where the path should end. 203 * @return The path from the source node to the target node. 204 */ 205 public static TreebankNodePath getPath(TreebankNode source, TreebankNode target) { 206 List<TreebankNode> sourceToRoot = getPathToRoot(source); 207 List<TreebankNode> targetToRoot = getPathToRoot(target); 208 209 TreebankNode ancestor = null; 210 while (sourceToRoot.size() > 0 && targetToRoot.size() > 0 211 && sourceToRoot.get(sourceToRoot.size() - 1) == targetToRoot.get(targetToRoot.size() - 1)) { 212 ancestor = sourceToRoot.remove(sourceToRoot.size() - 1); 213 ancestor = targetToRoot.remove(targetToRoot.size() - 1); 214 } 215 216 return new TreebankNodePath(ancestor, sourceToRoot, targetToRoot); 217 } 218 219 /** 220 * Format the TreebankNode as a Penn-Treebank-style parenthesized string. 221 * 222 * @param node 223 * The TreebankNode to be formatted. 224 * @return A parenthesized Penn-Treebank-style string. 225 */ 226 public static String toTreebankString(TreebankNode node) { 227 StringBuilder builder = new StringBuilder(); 228 builder.append('(').append(node.getNodeType()); 229 if (node.getLeaf()) { 230 builder.append(' ').append(node.getCoveredText()); 231 } else { 232 for (TreebankNode child : JCasUtil.select(node.getChildren(), TreebankNode.class)) { 233 builder.append(' ').append(toTreebankString(child)); 234 } 235 } 236 builder.append(')'); 237 return builder.toString(); 238 } 239 240 public static TreebankNode getParent(TreebankNode node) { 241 if (node != null) { 242 node = node.getParent(); 243 } 244 return node; 245 } 246 247 public static TreebankNode getAncestorWithType(TreebankNode node, String type) { 248 while (node != null && !node.getNodeType().equals(type)) { 249 node = node.getParent(); 250 } 251 return node; 252 } 253 254 /** 255 * Create a leaf TreebankNode in a JCas. 256 * 257 * @param jCas 258 * The JCas which the annotation should be added to. 259 * @param begin 260 * The begin offset of the node. 261 * @param end 262 * The end offset of the node. 263 * @param nodeType 264 * The part of speech tag of the node. 265 * @return The TreebankNode which was added to the JCas. 266 */ 267 public static TreebankNode newNode(JCas jCas, int begin, int end, String nodeType) { 268 TreebankNode node = new TreebankNode(jCas, begin, end); 269 node.setNodeType(nodeType); 270 node.setChildren(new FSArray(jCas, 0)); 271 node.setLeaf(true); 272 node.addToIndexes(); 273 return node; 274 } 275 276 /** 277 * Create a branch TreebankNode in a JCas. The offsets of this node will be determined by its 278 * children. 279 * 280 * @param jCas 281 * The JCas which the annotation should be added to. 282 * @param nodeType 283 * The phrase type tag of the node. 284 * @param children 285 * The TreebankNode children of the node. 286 * @return The TreebankNode which was added to the JCas. 287 */ 288 public static TreebankNode newNode(JCas jCas, String nodeType, TreebankNode... children) { 289 int begin = children[0].getBegin(); 290 int end = children[children.length - 1].getEnd(); 291 TreebankNode node = new TreebankNode(jCas, begin, end); 292 node.setNodeType(nodeType); 293 node.addToIndexes(); 294 FSArray fsArray = new FSArray(jCas, children.length); 295 fsArray.copyFromArray(children, 0, 0, children.length); 296 node.setChildren(fsArray); 297 for (TreebankNode child : children) { 298 child.setParent(node); 299 } 300 return node; 301 } 302 303 public static TopTreebankNode getTopNode(TreebankNode node) { 304 if (node instanceof TopTreebankNode) 305 return (TopTreebankNode) node; 306 307 TreebankNode parent = node.getParent(); 308 while (parent != null) { 309 if (parent instanceof TopTreebankNode) 310 return (TopTreebankNode) parent; 311 node = parent; 312 parent = node.getParent(); 313 } 314 return null; 315 } 316 317 /** 318 * A "pretty print" of this node that may be useful for e.g. debugging. 319 */ 320 public static void print(PrintStream out, TreebankNode node) { 321 out.println(print(node, 0)); 322 } 323 324 private static String print(TreebankNode node, int tabs) { 325 StringBuffer returnValue = new StringBuffer(); 326 String tabString = getTabs(tabs); 327 returnValue.append(tabString + node.getNodeType()); 328 if (node.getNodeValue() != null) 329 returnValue.append(":" + node.getNodeValue() + "\n"); 330 else { 331 returnValue.append(":" + node.getCoveredText() + "\n"); 332 } 333 if (node.getChildren().size() > 0) { 334 Collection<TreebankNode> children = JCasUtil.select(node.getChildren(), TreebankNode.class); 335 for (TreebankNode child : children) { 336 returnValue.append(print(child, (tabs + 1))); 337 } 338 } 339 return returnValue.toString(); 340 } 341 342 private static String getTabs(int tabs) { 343 char[] chars = new char[tabs]; 344 Arrays.fill(chars, ' '); 345 return new String(chars); 346 } 347}