001/** 
002 * Copyright (c) 2007-2008, Regents of the University of Colorado 
003 * All rights reserved.
004 * 
005 * Redistribution and use in source and binary forms, with or without
006 * modification, are permitted provided that the following conditions are met:
007 * 
008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
011 * 
012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
022 * POSSIBILITY OF SUCH DAMAGE. 
023 */
024package org.cleartk.feature.syntax;
025
026import java.util.ArrayList;
027import java.util.Arrays;
028import java.util.HashMap;
029import java.util.HashSet;
030import java.util.List;
031import java.util.Map;
032import java.util.Set;
033
034import org.apache.uima.jcas.JCas;
035import org.cleartk.ml.Feature;
036import org.cleartk.ml.feature.extractor.CleartkExtractorException;
037import org.cleartk.ml.feature.extractor.FeatureExtractor1;
038import org.cleartk.syntax.constituent.type.TreebankNode;
039import org.apache.uima.fit.util.JCasUtil;
040
041import com.google.common.collect.Lists;
042
043/**
044 * <br>
045 * Copyright (c) 2007-2008, Regents of the University of Colorado <br>
046 * All rights reserved.
047 * 
048 * 
049 * @author Philipp Wetzler
050 * 
051 * 
052 *         The rules for finding the head word were adapted from the ASSERT system for semantic role
053 *         labeling. For more information see:
054 * 
055 *         http://cemantix.org/
056 * 
057 *         and
058 * 
059 *         Shallow Semantic Parsing using Support Vector Machines Sameer S. Pradhan, Wayne Ward,
060 *         Kadri Hacioglu, James H. Martin, Daniel Jurafsky, in Proceedings of the Human Language
061 *         Technology Conference/North American chapter of the Association for Computational
062 *         Linguistics annual meeting (HLT/NAACL-2004), Boston, MA, May 2-7, 2004
063 * 
064 */
065
066public class HeadWordExtractor implements FeatureExtractor1<TreebankNode> {
067
068  static final String[] head1 = {
069      "ADJP JJ",
070      "ADJP JJR",
071      "ADJP JJS",
072      "ADVP RB",
073      "ADVP RBB",
074      "LST LS",
075      "NAC NNS",
076      "NAC NN",
077      "NAC PRP",
078      "NAC NNPS",
079      "NAC NNP",
080      "NX NNS",
081      "NX NN",
082      "NX PRP",
083      "NX NNPS",
084      "NX NNP",
085      "NP NNS",
086      "NP NN",
087      "NP PRP",
088      "NP NNPS",
089      "NP NNP",
090      "NP POS",
091      "NP $",
092      "PP IN",
093      "PP TO",
094      "PP RP",
095      "PRT RP",
096      "S VP",
097      "S1 S",
098      "SBAR IN",
099      "SBAR WHNP",
100      "SBARQ SQ",
101      "SBARQ VP",
102      "SINV VP",
103      "SQ MD",
104      "SQ AUX",
105      "VP VB",
106      "VP VBZ",
107      "VP VBP",
108      "VP VBG",
109      "VP VBN",
110      "VP VBD",
111      "VP AUX",
112      "VP AUXG",
113      "VP TO",
114      "VP MD",
115      "WHADJP WRB",
116      "WHADVP WRB",
117      "WHNP WP",
118      "WHNP WDT",
119      "WHNP WP$",
120      "WHPP IN",
121      "WHPP TO" };
122
123  static final String[] head2 = {
124      "ADJP VBN",
125      "ADJP RB",
126      "NAC NP",
127      "NAC CD",
128      "NAC FW",
129      "NAC ADJP",
130      "NAC JJ",
131      "NX NP",
132      "NX CD",
133      "NX FW",
134      "NX ADJP",
135      "NX JJ",
136      "NP CD",
137      "NP ADJP",
138      "NP JJ",
139      "S SINV",
140      "S SBARQ",
141      "S X",
142      "PRT RB",
143      "PRT IN",
144      "SBAR WHADJP",
145      "SBAR WHADVP",
146      "SBAR WHPP",
147      "SBARQ S",
148      "SBARQ SINV",
149      "SBARQ X",
150      "SINV SBAR",
151      "SQ VP" };
152
153  static final String[] term = {
154      "AUX",
155      "AUXG",
156      "CC",
157      "CD",
158      "DT",
159      "EX",
160      "FW",
161      "IN",
162      "JJ",
163      "JJR",
164      "JJS",
165      "LS",
166      "MD",
167      "NN",
168      "NNS",
169      "NNP",
170      "NNPS",
171      "PDT",
172      "POS",
173      "PRP",
174      "PRP$",
175      "RB",
176      "RBR",
177      "RBS",
178      "RP",
179      "SYM",
180      "TO",
181      "UH",
182      "VB",
183      "VBD",
184      "VBG",
185      "VBN",
186      "VBP",
187      "VBZ",
188      "WDT",
189      "WP",
190      "WP$",
191      "WRB",
192      "#",
193      "$",
194      ".",
195      ",",
196      ":",
197      "-RRB-",
198      "-LRB-",
199      "``",
200      "''",
201      "EOS" };
202
203  static final String[] punc = { "#", "$", ".", ",", ":", "-RRB-", "-LRB-", "``", "''" };
204
205  static Set<String> headRules1;
206
207  static Set<String> headRules2;
208
209  static Set<String> terminals;
210
211  static Set<String> punctuations;
212
213  static Map<String, Integer> cache;
214
215  static Boolean setsInitialized = false;
216
217  static void buildSets() {
218    synchronized (setsInitialized) {
219      if (setsInitialized)
220        return;
221      HeadWordExtractor.headRules1 = new HashSet<String>(Arrays.asList(HeadWordExtractor.head1));
222      HeadWordExtractor.headRules2 = new HashSet<String>(Arrays.asList(HeadWordExtractor.head2));
223      HeadWordExtractor.terminals = new HashSet<String>(Arrays.asList(HeadWordExtractor.term));
224      HeadWordExtractor.punctuations = new HashSet<String>(Arrays.asList(HeadWordExtractor.punc));
225      HeadWordExtractor.cache = new HashMap<String, Integer>();
226      setsInitialized = true;
227    }
228  }
229
230  FeatureExtractor1<TreebankNode> subExtractor;
231
232  boolean includePPHead;
233
234  public HeadWordExtractor(FeatureExtractor1<TreebankNode> subExtractor, boolean includePPHead) {
235    this.subExtractor = subExtractor;
236    this.includePPHead = includePPHead;
237    HeadWordExtractor.buildSets();
238  }
239
240  public HeadWordExtractor(FeatureExtractor1<TreebankNode> subExtractor) {
241    this(subExtractor, false);
242  }
243
244  public List<Feature> extract(JCas jCas, TreebankNode constituent)
245      throws CleartkExtractorException {
246
247    TreebankNode headNode = findHead(constituent);
248    List<Feature> features = new ArrayList<Feature>(extractNode(jCas, headNode, false));
249
250    if (includePPHead && constituent.getNodeType().equals("PP")) {
251      for (int i = 0; i < constituent.getChildren().size(); i++) {
252        TreebankNode child = constituent.getChildren(i);
253
254        if (child.getNodeType().equals("NP")) {
255          features = new ArrayList<Feature>(features);
256          features.addAll(extractNode(jCas, findHead(child), true));
257          break;
258        }
259      }
260    }
261
262    return features;
263  }
264
265  List<Feature> extractNode(JCas jCas, TreebankNode node, boolean specialCasePP)
266      throws CleartkExtractorException {
267    List<Feature> features = subExtractor.extract(jCas, node);
268
269    for (Feature feature : features) {
270      feature.setName(createName(specialCasePP, feature));
271    }
272
273    return features;
274  }
275
276  TreebankNode findHead(TreebankNode parentNode) {
277    TreebankNode cursor = parentNode;
278
279    while (cursor.getChildren() != null && cursor.getChildren().size() > 0)
280      cursor = findHead2(cursor);
281
282    return cursor;
283  }
284
285  TreebankNode findHead2(TreebankNode parentNode) {
286    List<TreebankNode> childNodes = Lists.newArrayList(JCasUtil.select(
287        parentNode.getChildren(),
288        TreebankNode.class));
289    List<String> childTypes = new ArrayList<String>(childNodes.size());
290
291    String parentType = parentNode.getNodeType();
292
293    for (TreebankNode childNode : childNodes)
294      childTypes.add(childNode.getNodeType());
295
296    int headIndex = findHead3(parentType, childTypes);
297
298    return childNodes.get(headIndex);
299  }
300
301  int findHead3(String lhs, List<String> rhss) {
302    StringBuffer keyBuffer = new StringBuffer(lhs + " ->");
303    for (String rhs : rhss)
304      keyBuffer.append(" " + rhs);
305    String key = keyBuffer.toString();
306
307    synchronized (HeadWordExtractor.cache) {
308      if (cache.containsKey(key)) {
309        return cache.get(key);
310      }
311    }
312
313    int currentBestGuess = -1;
314    int currentGuessUncertainty = 10;
315
316    for (int current = 0; current < rhss.size(); current++) {
317      String rhs = rhss.get(current);
318      String rule = lhs + " " + rhs;
319
320      if (currentGuessUncertainty >= 1 && headRules1.contains(rule)) {
321        currentBestGuess = current;
322        currentGuessUncertainty = 1;
323      } else if (currentGuessUncertainty > 2 && lhs != null && lhs.equals(rhs)) {
324        currentBestGuess = current;
325        currentGuessUncertainty = 2;
326      } else if (currentGuessUncertainty >= 3 && headRules2.contains(rule)) {
327        currentBestGuess = current;
328        currentGuessUncertainty = 3;
329      } else if (currentGuessUncertainty >= 5 && !terminals.contains(rhs) && rhs != null
330          && !rhs.equals("PP")) {
331        currentBestGuess = current;
332        currentGuessUncertainty = 5;
333      } else if (currentGuessUncertainty >= 6 && !terminals.contains(rhs)) {
334        currentBestGuess = current;
335        currentGuessUncertainty = 6;
336      } else if (currentGuessUncertainty >= 7) {
337        currentBestGuess = current;
338        currentGuessUncertainty = 7;
339      }
340    }
341
342    synchronized (HeadWordExtractor.cache) {
343      cache.put(key, currentBestGuess);
344    }
345
346    return currentBestGuess;
347  }
348
349  private String createName(boolean specialCasePP, Feature subFeature) {
350    StringBuffer buffer = new StringBuffer();
351
352    if (specialCasePP)
353      buffer.append("PP");
354
355    buffer.append("HeadWord");
356
357    return Feature.createName(buffer.toString(), subFeature.getName());
358  }
359}