001/** 
002 * Copyright (c) 2010, Regents of the University of Colorado 
003 * All rights reserved.
004 * 
005 * Redistribution and use in source and binary forms, with or without
006 * modification, are permitted provided that the following conditions are met:
007 * 
008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
011 * 
012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
022 * POSSIBILITY OF SUCH DAMAGE. 
023 */
024package org.cleartk.feature;
025
026import java.util.Collections;
027import java.util.List;
028
029import org.apache.uima.jcas.JCas;
030import org.apache.uima.jcas.tcas.Annotation;
031import org.cleartk.ml.Feature;
032import org.cleartk.ml.feature.extractor.NamedFeatureExtractor1;
033
034/**
035 * <br>
036 * Copyright (c) 2010, Regents of the University of Colorado <br>
037 * All rights reserved.
038 * 
039 * Extract a slice of the text covered by the annotation. Handles negative slice indices to make it
040 * easy to slice from the end of the string.
041 * 
042 * @author Steven Bethard
043 */
044public class TextSliceExtractor<T extends Annotation> implements NamedFeatureExtractor1<T> {
045
046  private int start;
047
048  private int stop;
049  
050  private String featureName;
051
052  /**
053   * Create an extractor for a given slice of the text. E.g.
054   * <code>new TextSliceExtractor(1, -1)</code> would extract all of the text but its first and last
055   * characters.
056   * 
057   * @param start
058   *          The first character offset of the slice. If negative, it is assumed to count backwards
059   *          from the end of the string. If the offset falls before the start of the string, the
060   *          start of the string will be used instead.
061   * @param stop
062   *          The last character offset of the slice. If negative, it is assumed to count backwards
063   *          from the end of the string. If the offset falls after the end of the string, the end
064   *          of the string will be used instead.
065   */
066  public TextSliceExtractor(int start, int stop) {
067    this.start = start;
068    this.stop = stop;
069    this.featureName = "Suffix";
070  }
071
072  /**
073   * Create an extractor for a slice of text from a single offset to the end of the string. E.g.
074   * <code>new TextSliceExtractor(-2)</code> would extract a suffix of length 2 from the text.
075   * 
076   * @param start
077   *          The first character offset of the slice. If negative, it is assumed to count backwards
078   *          from the end of the string. If the offset falls before the start of the string, the
079   *          start of the string will be used instead.
080   */
081  public TextSliceExtractor(int start) {
082    this(start, Integer.MAX_VALUE);
083  }
084  
085  @Override
086  public String getFeatureName() {
087    return this.featureName;
088  }
089
090  public List<Feature> extract(JCas view, T focusAnnotation) {
091    String text = focusAnnotation.getCoveredText();
092    int startOffset = this.start;
093    if (startOffset < 0) {
094      startOffset += text.length();
095    }
096    if (startOffset < 0) {
097      startOffset = 0;
098    }
099    int stopOffset = this.stop;
100    if (stopOffset < 0) {
101      stopOffset += text.length();
102    }
103    if (stopOffset > text.length()) {
104      stopOffset = text.length();
105    }
106    text = text.substring(startOffset, stopOffset);
107    return Collections.singletonList(new Feature(this.featureName, text));
108  }
109
110}