001/** 002 * Copyright (c) 2009, Regents of the University of Colorado 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without 006 * modification, are permitted provided that the following conditions are met: 007 * 008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 011 * 012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 022 * POSSIBILITY OF SUCH DAMAGE. 023 */ 024package org.cleartk.token.pos.impl; 025 026import java.util.ArrayList; 027import java.util.List; 028 029import org.apache.uima.UimaContext; 030import org.apache.uima.jcas.JCas; 031import org.apache.uima.resource.ResourceInitializationException; 032import org.cleartk.ml.Feature; 033import org.cleartk.ml.feature.extractor.CleartkExtractor; 034import org.cleartk.ml.feature.extractor.CleartkExtractorException; 035import org.cleartk.ml.feature.extractor.CoveredTextExtractor; 036import org.cleartk.ml.feature.extractor.FeatureExtractor1; 037import org.cleartk.ml.feature.extractor.CleartkExtractor.Following; 038import org.cleartk.ml.feature.extractor.CleartkExtractor.Ngram; 039import org.cleartk.ml.feature.extractor.CleartkExtractor.Preceding; 040import org.cleartk.ml.feature.function.CapitalTypeFeatureFunction; 041import org.cleartk.ml.feature.function.CharacterNgramFeatureFunction; 042import org.cleartk.ml.feature.function.FeatureFunctionExtractor; 043import org.cleartk.ml.feature.function.LowerCaseFeatureFunction; 044import org.cleartk.ml.feature.function.NumericTypeFeatureFunction; 045import org.cleartk.token.pos.PosFeatureExtractor; 046import org.cleartk.token.type.Sentence; 047import org.cleartk.token.type.Token; 048import org.apache.uima.fit.factory.initializable.Initializable; 049 050import com.google.common.collect.Lists; 051 052/** 053 * <br> 054 * Copyright (c) 2009, Regents of the University of Colorado <br> 055 * All rights reserved. 056 * 057 * @author Philip Ogren 058 * 059 */ 060 061public class DefaultFeatureExtractor implements PosFeatureExtractor<Token, Sentence>, Initializable { 062 063 private List<FeatureExtractor1<Token>> simpleExtractors; 064 065 private List<CleartkExtractor<Token, Token>> windowExtractors; 066 067 private List<CleartkExtractor<Token, Token>> windowNGramExtractors; 068 069 public void initialize(UimaContext context) throws ResourceInitializationException { 070 simpleExtractors = Lists.newArrayList(); 071 072 FeatureExtractor1<Token> wordExtractor = new CoveredTextExtractor<Token>(); 073 074 CharacterNgramFeatureFunction.Orientation fromLeft = CharacterNgramFeatureFunction.Orientation.LEFT_TO_RIGHT; 075 CharacterNgramFeatureFunction.Orientation fromRight = CharacterNgramFeatureFunction.Orientation.RIGHT_TO_LEFT; 076 simpleExtractors.add(new FeatureFunctionExtractor<Token>( 077 wordExtractor, 078 new LowerCaseFeatureFunction(), 079 new CapitalTypeFeatureFunction(), 080 new NumericTypeFeatureFunction(), 081 new CharacterNgramFeatureFunction(fromLeft, 0, 1), 082 new CharacterNgramFeatureFunction(fromLeft, 0, 2), 083 new CharacterNgramFeatureFunction(fromLeft, 0, 3), 084 new CharacterNgramFeatureFunction(fromRight, 0, 1), 085 new CharacterNgramFeatureFunction(fromRight, 0, 2), 086 new CharacterNgramFeatureFunction(fromRight, 0, 3), 087 new CharacterNgramFeatureFunction(fromRight, 0, 4), 088 new CharacterNgramFeatureFunction(fromRight, 0, 5), 089 new CharacterNgramFeatureFunction(fromRight, 0, 6))); 090 091 windowExtractors = Lists.newArrayList(); 092 windowExtractors.add(new CleartkExtractor<Token, Token>( 093 Token.class, 094 wordExtractor, 095 new Preceding(2), 096 new Following(2))); 097 098 windowNGramExtractors = Lists.newArrayList(); 099 windowNGramExtractors.add(new CleartkExtractor<Token, Token>(Token.class, wordExtractor, new Ngram( 100 new Preceding(2)), new Ngram(new Following(2)))); 101 } 102 103 public List<Feature> extractFeatures(JCas jCas, Token token, Sentence sentence) 104 throws CleartkExtractorException { 105 List<Feature> features = new ArrayList<Feature>(); 106 107 for (FeatureExtractor1<Token> extractor : simpleExtractors) { 108 features.addAll(extractor.extract(jCas, token)); 109 } 110 111 for (CleartkExtractor<Token, Token> extractor : windowExtractors) { 112 features.addAll(extractor.extractWithin(jCas, token, sentence)); 113 } 114 115 for (CleartkExtractor<Token, Token> extractor : windowNGramExtractors) { 116 features.addAll(extractor.extractWithin(jCas, token, sentence)); 117 } 118 119 return features; 120 } 121 122}