/**
 * Copyright (c) 2007-2008, Regents of the University of Colorado
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
023 */ 024package org.cleartk.feature.syntax; 025 026import java.util.ArrayList; 027import java.util.Arrays; 028import java.util.HashMap; 029import java.util.HashSet; 030import java.util.List; 031import java.util.Map; 032import java.util.Set; 033 034import org.apache.uima.jcas.JCas; 035import org.cleartk.ml.Feature; 036import org.cleartk.ml.feature.extractor.CleartkExtractorException; 037import org.cleartk.ml.feature.extractor.FeatureExtractor1; 038import org.cleartk.syntax.constituent.type.TreebankNode; 039import org.apache.uima.fit.util.JCasUtil; 040 041import com.google.common.collect.Lists; 042 043/** 044 * <br> 045 * Copyright (c) 2007-2008, Regents of the University of Colorado <br> 046 * All rights reserved. 047 * 048 * 049 * @author Philipp Wetzler 050 * 051 * 052 * The rules for finding the head word were adapted from the ASSERT system for semantic role 053 * labeling. For more information see: 054 * 055 * http://cemantix.org/ 056 * 057 * and 058 * 059 * Shallow Semantic Parsing using Support Vector Machines Sameer S. Pradhan, Wayne Ward, 060 * Kadri Hacioglu, James H. 
Martin, Daniel Jurafsky, in Proceedings of the Human Language 061 * Technology Conference/North American chapter of the Association for Computational 062 * Linguistics annual meeting (HLT/NAACL-2004), Boston, MA, May 2-7, 2004 063 * 064 */ 065 066public class HeadWordExtractor implements FeatureExtractor1<TreebankNode> { 067 068 static final String[] head1 = { 069 "ADJP JJ", 070 "ADJP JJR", 071 "ADJP JJS", 072 "ADVP RB", 073 "ADVP RBB", 074 "LST LS", 075 "NAC NNS", 076 "NAC NN", 077 "NAC PRP", 078 "NAC NNPS", 079 "NAC NNP", 080 "NX NNS", 081 "NX NN", 082 "NX PRP", 083 "NX NNPS", 084 "NX NNP", 085 "NP NNS", 086 "NP NN", 087 "NP PRP", 088 "NP NNPS", 089 "NP NNP", 090 "NP POS", 091 "NP $", 092 "PP IN", 093 "PP TO", 094 "PP RP", 095 "PRT RP", 096 "S VP", 097 "S1 S", 098 "SBAR IN", 099 "SBAR WHNP", 100 "SBARQ SQ", 101 "SBARQ VP", 102 "SINV VP", 103 "SQ MD", 104 "SQ AUX", 105 "VP VB", 106 "VP VBZ", 107 "VP VBP", 108 "VP VBG", 109 "VP VBN", 110 "VP VBD", 111 "VP AUX", 112 "VP AUXG", 113 "VP TO", 114 "VP MD", 115 "WHADJP WRB", 116 "WHADVP WRB", 117 "WHNP WP", 118 "WHNP WDT", 119 "WHNP WP$", 120 "WHPP IN", 121 "WHPP TO" }; 122 123 static final String[] head2 = { 124 "ADJP VBN", 125 "ADJP RB", 126 "NAC NP", 127 "NAC CD", 128 "NAC FW", 129 "NAC ADJP", 130 "NAC JJ", 131 "NX NP", 132 "NX CD", 133 "NX FW", 134 "NX ADJP", 135 "NX JJ", 136 "NP CD", 137 "NP ADJP", 138 "NP JJ", 139 "S SINV", 140 "S SBARQ", 141 "S X", 142 "PRT RB", 143 "PRT IN", 144 "SBAR WHADJP", 145 "SBAR WHADVP", 146 "SBAR WHPP", 147 "SBARQ S", 148 "SBARQ SINV", 149 "SBARQ X", 150 "SINV SBAR", 151 "SQ VP" }; 152 153 static final String[] term = { 154 "AUX", 155 "AUXG", 156 "CC", 157 "CD", 158 "DT", 159 "EX", 160 "FW", 161 "IN", 162 "JJ", 163 "JJR", 164 "JJS", 165 "LS", 166 "MD", 167 "NN", 168 "NNS", 169 "NNP", 170 "NNPS", 171 "PDT", 172 "POS", 173 "PRP", 174 "PRP$", 175 "RB", 176 "RBR", 177 "RBS", 178 "RP", 179 "SYM", 180 "TO", 181 "UH", 182 "VB", 183 "VBD", 184 "VBG", 185 "VBN", 186 "VBP", 187 "VBZ", 188 "WDT", 
189 "WP", 190 "WP$", 191 "WRB", 192 "#", 193 "$", 194 ".", 195 ",", 196 ":", 197 "-RRB-", 198 "-LRB-", 199 "``", 200 "''", 201 "EOS" }; 202 203 static final String[] punc = { "#", "$", ".", ",", ":", "-RRB-", "-LRB-", "``", "''" }; 204 205 static Set<String> headRules1; 206 207 static Set<String> headRules2; 208 209 static Set<String> terminals; 210 211 static Set<String> punctuations; 212 213 static Map<String, Integer> cache; 214 215 static Boolean setsInitialized = false; 216 217 static void buildSets() { 218 synchronized (setsInitialized) { 219 if (setsInitialized) 220 return; 221 HeadWordExtractor.headRules1 = new HashSet<String>(Arrays.asList(HeadWordExtractor.head1)); 222 HeadWordExtractor.headRules2 = new HashSet<String>(Arrays.asList(HeadWordExtractor.head2)); 223 HeadWordExtractor.terminals = new HashSet<String>(Arrays.asList(HeadWordExtractor.term)); 224 HeadWordExtractor.punctuations = new HashSet<String>(Arrays.asList(HeadWordExtractor.punc)); 225 HeadWordExtractor.cache = new HashMap<String, Integer>(); 226 setsInitialized = true; 227 } 228 } 229 230 FeatureExtractor1<TreebankNode> subExtractor; 231 232 boolean includePPHead; 233 234 public HeadWordExtractor(FeatureExtractor1<TreebankNode> subExtractor, boolean includePPHead) { 235 this.subExtractor = subExtractor; 236 this.includePPHead = includePPHead; 237 HeadWordExtractor.buildSets(); 238 } 239 240 public HeadWordExtractor(FeatureExtractor1<TreebankNode> subExtractor) { 241 this(subExtractor, false); 242 } 243 244 public List<Feature> extract(JCas jCas, TreebankNode constituent) 245 throws CleartkExtractorException { 246 247 TreebankNode headNode = findHead(constituent); 248 List<Feature> features = new ArrayList<Feature>(extractNode(jCas, headNode, false)); 249 250 if (includePPHead && constituent.getNodeType().equals("PP")) { 251 for (int i = 0; i < constituent.getChildren().size(); i++) { 252 TreebankNode child = constituent.getChildren(i); 253 254 if (child.getNodeType().equals("NP")) { 255 
features = new ArrayList<Feature>(features); 256 features.addAll(extractNode(jCas, findHead(child), true)); 257 break; 258 } 259 } 260 } 261 262 return features; 263 } 264 265 List<Feature> extractNode(JCas jCas, TreebankNode node, boolean specialCasePP) 266 throws CleartkExtractorException { 267 List<Feature> features = subExtractor.extract(jCas, node); 268 269 for (Feature feature : features) { 270 feature.setName(createName(specialCasePP, feature)); 271 } 272 273 return features; 274 } 275 276 TreebankNode findHead(TreebankNode parentNode) { 277 TreebankNode cursor = parentNode; 278 279 while (cursor.getChildren() != null && cursor.getChildren().size() > 0) 280 cursor = findHead2(cursor); 281 282 return cursor; 283 } 284 285 TreebankNode findHead2(TreebankNode parentNode) { 286 List<TreebankNode> childNodes = Lists.newArrayList(JCasUtil.select( 287 parentNode.getChildren(), 288 TreebankNode.class)); 289 List<String> childTypes = new ArrayList<String>(childNodes.size()); 290 291 String parentType = parentNode.getNodeType(); 292 293 for (TreebankNode childNode : childNodes) 294 childTypes.add(childNode.getNodeType()); 295 296 int headIndex = findHead3(parentType, childTypes); 297 298 return childNodes.get(headIndex); 299 } 300 301 int findHead3(String lhs, List<String> rhss) { 302 StringBuffer keyBuffer = new StringBuffer(lhs + " ->"); 303 for (String rhs : rhss) 304 keyBuffer.append(" " + rhs); 305 String key = keyBuffer.toString(); 306 307 synchronized (HeadWordExtractor.cache) { 308 if (cache.containsKey(key)) { 309 return cache.get(key); 310 } 311 } 312 313 int currentBestGuess = -1; 314 int currentGuessUncertainty = 10; 315 316 for (int current = 0; current < rhss.size(); current++) { 317 String rhs = rhss.get(current); 318 String rule = lhs + " " + rhs; 319 320 if (currentGuessUncertainty >= 1 && headRules1.contains(rule)) { 321 currentBestGuess = current; 322 currentGuessUncertainty = 1; 323 } else if (currentGuessUncertainty > 2 && lhs != null && 
lhs.equals(rhs)) { 324 currentBestGuess = current; 325 currentGuessUncertainty = 2; 326 } else if (currentGuessUncertainty >= 3 && headRules2.contains(rule)) { 327 currentBestGuess = current; 328 currentGuessUncertainty = 3; 329 } else if (currentGuessUncertainty >= 5 && !terminals.contains(rhs) && rhs != null 330 && !rhs.equals("PP")) { 331 currentBestGuess = current; 332 currentGuessUncertainty = 5; 333 } else if (currentGuessUncertainty >= 6 && !terminals.contains(rhs)) { 334 currentBestGuess = current; 335 currentGuessUncertainty = 6; 336 } else if (currentGuessUncertainty >= 7) { 337 currentBestGuess = current; 338 currentGuessUncertainty = 7; 339 } 340 } 341 342 synchronized (HeadWordExtractor.cache) { 343 cache.put(key, currentBestGuess); 344 } 345 346 return currentBestGuess; 347 } 348 349 private String createName(boolean specialCasePP, Feature subFeature) { 350 StringBuffer buffer = new StringBuffer(); 351 352 if (specialCasePP) 353 buffer.append("PP"); 354 355 buffer.append("HeadWord"); 356 357 return Feature.createName(buffer.toString(), subFeature.getName()); 358 } 359}