001/** 002 * Copyright (c) 2011, Regents of the University of Colorado 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without 006 * modification, are permitted provided that the following conditions are met: 007 * 008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 011 * 012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 022 * POSSIBILITY OF SUCH DAMAGE. 023 */ 024 025package org.cleartk.util.ae.parenthetical; 026 027import java.lang.reflect.Constructor; 028import java.util.Stack; 029 030import org.apache.uima.UimaContext; 031import org.apache.uima.analysis_engine.AnalysisEngineDescription; 032import org.apache.uima.analysis_engine.AnalysisEngineProcessException; 033import org.apache.uima.fit.component.JCasAnnotator_ImplBase; 034import org.apache.uima.fit.descriptor.ConfigurationParameter; 035import org.apache.uima.fit.factory.AnalysisEngineFactory; 036import org.apache.uima.fit.factory.ConfigurationParameterFactory; 037import org.apache.uima.fit.factory.initializable.InitializableFactory; 038import org.apache.uima.fit.util.JCasUtil; 039import org.apache.uima.jcas.JCas; 040import org.apache.uima.jcas.tcas.Annotation; 041import org.apache.uima.resource.ResourceInitializationException; 042import org.cleartk.util.CleartkInitializationException; 043 044/** 045 * <br> 046 * Copyright (c) 2011, Regents of the University of Colorado <br> 047 * All rights reserved. 048 * 049 * @author Philip Ogren 050 */ 051 052public class ParentheticalAnnotator extends JCasAnnotator_ImplBase { 053 054 public static final String PARAM_WINDOW_TYPE_NAME = "windowTypeName"; 055 056 private static final String WINDOW_TYPE_DESCRIPTION = "specifies the class type of annotations that will be tokenized. " 057 + "If no value is given, then the entire document will be tokenized at once. "; 058 059 // do not set the default value to 'org.cleartk.token.type.Sentence'. If you do, then unit tests 060 // will break. The symptom will be a tokenizer that doesn't generate any tokens (because there 061 // are no sentences to iterate over. 062 @ConfigurationParameter( 063 name = PARAM_WINDOW_TYPE_NAME, 064 mandatory = false, 065 description = WINDOW_TYPE_DESCRIPTION) 066 private String windowTypeName; 067 068 public static final String PARAM_PARENTHETICAL_TYPE_NAME = "parentheticalTypeName"; 069 070 @ConfigurationParameter( 071 name = PARAM_PARENTHETICAL_TYPE_NAME, 072 description = "class name of the annotations that are created by this annotator.", 073 mandatory = true) 074 private String parentheticalTypeName; 075 076 public static final String PARAM_LEFT_PARENTHESIS = "leftParenthesis"; 077 078 @ConfigurationParameter(name = PARAM_LEFT_PARENTHESIS, defaultValue = "(", mandatory = true) 079 private String leftParenthesis; 080 081 private char leftParen; 082 083 public static final String PARAM_RIGHT_PARENTHESIS = "rightParenthesis"; 084 085 @ConfigurationParameter(name = PARAM_RIGHT_PARENTHESIS, defaultValue = ")", mandatory = true) 086 private String rightParenthesis; 087 088 private char rightParen; 089 090 private Class<? extends Annotation> windowClass; 091 092 private Constructor<? extends Annotation> parentheticalConstructor; 093 094 public void initialize(UimaContext uimaContext) throws ResourceInitializationException { 095 super.initialize(uimaContext); 096 if (windowTypeName != null) 097 windowClass = InitializableFactory.getClass(windowTypeName, Annotation.class); 098 099 if (leftParenthesis.length() != 1) { 100 throw CleartkInitializationException.notSingleCharacter( 101 PARAM_LEFT_PARENTHESIS, 102 leftParenthesis); 103 } 104 leftParen = leftParenthesis.charAt(0); 105 106 if (rightParenthesis.length() != 1) { 107 throw CleartkInitializationException.notSingleCharacter( 108 PARAM_RIGHT_PARENTHESIS, 109 rightParenthesis); 110 } 111 rightParen = rightParenthesis.charAt(0); 112 113 Class<? extends Annotation> parentheticalClass = InitializableFactory.getClass( 114 parentheticalTypeName, 115 Annotation.class); 116 117 try { 118 parentheticalConstructor = parentheticalClass.getConstructor(new Class[] { 119 JCas.class, 120 Integer.TYPE, 121 Integer.TYPE }); 122 } catch (Exception e) { 123 throw new ResourceInitializationException(e); 124 } 125 } 126 127 @Override 128 public void process(JCas jCas) throws AnalysisEngineProcessException { 129 if (windowClass != null) { 130 for (Annotation window : JCasUtil.select(jCas, windowClass)) { 131 String text = window.getCoveredText(); 132 createParentheticals(jCas, text, window.getBegin()); 133 } 134 } else { 135 String text = jCas.getDocumentText(); 136 createParentheticals(jCas, text, 0); 137 } 138 } 139 140 private void createParentheticals(JCas jCas, String text, int offset) 141 throws AnalysisEngineProcessException { 142 Stack<Integer> leftRoundedParens = new Stack<Integer>(); 143 leftRoundedParens.clear(); 144 for (int ci = 0; ci < text.length(); ci++) { 145 char c = text.charAt(ci); 146 if (c == leftParen) { 147 leftRoundedParens.push(ci); 148 } 149 if (c == rightParen && !leftRoundedParens.isEmpty()) { 150 int leftOffset = leftRoundedParens.pop(); 151 Annotation ann; 152 try { 153 ann = parentheticalConstructor.newInstance(jCas, offset + leftOffset, offset + ci + 1); 154 } catch (Exception e) { 155 throw new AnalysisEngineProcessException(e); 156 } 157 ann.addToIndexes(); 158 } 159 } 160 } 161 162 public static AnalysisEngineDescription getDescription( 163 Class<? extends Annotation> parentheticalClass) throws ResourceInitializationException { 164 return getDescription(parentheticalClass, null, '(', ')'); 165 } 166 167 public static AnalysisEngineDescription getDescription( 168 Class<? extends Annotation> parentheticalClass, 169 Class<? extends Annotation> windowClass) throws ResourceInitializationException { 170 return getDescription(parentheticalClass, windowClass, '(', ')'); 171 } 172 173 public static AnalysisEngineDescription getDescription( 174 Class<? extends Annotation> parentheticalClass, 175 Class<? extends Annotation> windowClass, 176 char leftParen, 177 char rightParen) throws ResourceInitializationException { 178 AnalysisEngineDescription aed = AnalysisEngineFactory.createEngineDescription( 179 ParentheticalAnnotator.class, 180 PARAM_LEFT_PARENTHESIS, 181 "" + leftParen, 182 PARAM_RIGHT_PARENTHESIS, 183 "" + rightParen, 184 PARAM_PARENTHETICAL_TYPE_NAME, 185 parentheticalClass.getName()); 186 187 if (windowClass != null) { 188 ConfigurationParameterFactory.addConfigurationParameters( 189 aed, 190 PARAM_WINDOW_TYPE_NAME, 191 windowClass.getName()); 192 } 193 194 return aed; 195 } 196}