001/*
002 * Copyright (c) 2011, Regents of the University of Colorado 
003 * All rights reserved.
004 * 
005 * Redistribution and use in source and binary forms, with or without
006 * modification, are permitted provided that the following conditions are met:
007 * 
008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
011 * 
012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
022 * POSSIBILITY OF SUCH DAMAGE. 
023 */
024package org.cleartk.ml.feature.extractor;
025
026import java.util.ArrayList;
027import java.util.HashMap;
028import java.util.Iterator;
029import java.util.List;
030import java.util.Locale;
031import java.util.Map;
032
033import org.apache.uima.fit.util.JCasUtil;
034import org.apache.uima.jcas.JCas;
035import org.apache.uima.jcas.tcas.Annotation;
036import org.cleartk.ml.Feature;
037
038import com.google.common.base.Joiner;
039import com.google.common.collect.LinkedHashMultiset;
040import com.google.common.collect.Multiset;
041
042/**
043 * A feature extractor that finds other {@link Annotation}s in the context of a focus annotation and
044 * extracts features from these. It can be used, for example, to:
045 * <ul>
046 * <li>Get the text of the 2 tokens before a focus annotation</li>
047 * <li>Get the parts of speech of the 3 tokens after a focus annotation</li>
048 * <li>Get the tokens after a focus annotation, beginning 2 after and ending 5 after, as a bag of
049 * words</li>
 * <li>Get an ngram concatenating the stem of the first word before a focus annotation and the first
 * word contained in the focus annotation</li>
053 * </ul>
054 * 
055 * <br>
056 * Copyright (c) 2011, Regents of the University of Colorado <br>
057 * All rights reserved.
058 * 
059 * @author Steven Bethard
060 */
061public class CleartkExtractor<FOCUS_T extends Annotation, SEARCH_T extends Annotation> implements
062    FeatureExtractor1<FOCUS_T>, FeatureExtractor2<FOCUS_T, FOCUS_T> {
063
064  private Class<SEARCH_T> annotationClass;
065
066  private FeatureExtractor1<SEARCH_T> extractor;
067
068  private Context[] contexts;
069
070  /**
071   * Create an extractor that finds {@link Annotation}s of the given type at the specified
072   * {@link Context}s and applies the given feature extractor to the annotations.
073   * 
074   * @param annotationClass
075   *          The type of annotation which should be searched for in the context.
076   * @param extractor
077   *          The feature extractor to apply to each annotation found.
078   * @param contexts
079   *          The contexts where the extractor should look for annotations.
080   */
081  public CleartkExtractor(
082      Class<SEARCH_T> annotationClass,
083      FeatureExtractor1<SEARCH_T> extractor,
084      Context... contexts) {
085    this.annotationClass = annotationClass;
086    this.extractor = extractor;
087    this.contexts = contexts;
088  }
089
090  @Override
091  public List<Feature> extract(JCas view, FOCUS_T focusAnnotation) throws CleartkExtractorException {
092    return this.extract(view, focusAnnotation, new NoBounds());
093  }
094
095  /**
096   * Extract features from the annotations around the focus annotation and within the given bounds.
097   * 
098   * @param view
099   *          The JCas containing the focus annotation.
100   * @param focusAnnotation
101   *          The annotation whose context is to be searched.
102   * @param boundsAnnotation
103   *          The boundary within which context annotations may be identified.
104   * @return The features extracted in the context of the focus annotation.
105   */
106  public List<Feature> extractWithin(JCas view, FOCUS_T focusAnnotation, Annotation boundsAnnotation)
107      throws CleartkExtractorException {
108    Bounds bounds = new SpanBounds(boundsAnnotation.getBegin(), boundsAnnotation.getEnd());
109    return this.extract(view, focusAnnotation, bounds);
110  }
111
112  public List<Feature> extractBetween(JCas view, FOCUS_T annotation1, FOCUS_T annotation2)
113      throws CleartkExtractorException {
114    int begin = annotation1.getEnd();
115    int end = annotation2.getBegin();
116    // FIXME: creating a new annotation may leak memory - is there a better approach?
117    Annotation focusAnnotation = new Annotation(view, begin, end);
118    return this.extract(view, focusAnnotation, new NoBounds());
119  }
120
121  @Override
122  public java.util.List<Feature> extract(JCas view, FOCUS_T annotation1, FOCUS_T annotation2)
123      throws CleartkExtractorException {
124    return this.extractBetween(view, annotation1, annotation2);
125  }
126
127  private List<Feature> extract(JCas view, Annotation focusAnnotation, Bounds bounds)
128      throws CleartkExtractorException {
129    List<Feature> features = new ArrayList<Feature>();
130    for (Context context : this.contexts) {
131      features.addAll(context.extract(
132          view,
133          focusAnnotation,
134          bounds,
135          this.annotationClass,
136          this.extractor));
137    }
138    return features;
139  }
140
141  /**
142   * A class representing the bounds within which a {@link CleartkExtractor} should look for
143   * annotations.
144   */
145  public static interface Bounds {
146
147    /**
148     * Determines whether or not an annotation lies within the given bounds.
149     * 
150     * @param annotation
151     *          The annotation to be checked.
152     * @return True if the annotation lies within the bounds.
153     */
154    public boolean contains(Annotation annotation);
155  }
156
157  /**
158   * A Bounds implementation that puts no restrictions on the context.
159   */
160  private static class NoBounds implements Bounds {
161
162    public NoBounds() {
163    }
164
165    @Override
166    public boolean contains(Annotation annotation) {
167      return true;
168    }
169
170  }
171
172  /**
173   * A Bounds implementation that restricts the context to annotations within a given span.
174   */
175  private static class SpanBounds implements Bounds {
176
177    private int begin;
178
179    private int end;
180
181    public SpanBounds(int begin, int end) {
182      this.begin = begin;
183      this.end = end;
184    }
185
186    @Override
187    public boolean contains(Annotation annotation) {
188      return annotation.getBegin() >= this.begin && annotation.getEnd() <= this.end;
189    }
190
191  }
192
193  /**
194   * A class representing a location that a {@link CleartkExtractor} should look for annotations.
195   */
196  public static interface Context {
197
198    /**
199     * Gets the base feature name that will be used in {@link Feature}s generated by this context.
200     * The actual feature names may include additional information (e.g. relative position), but
201     * this base name should be used in all aggregating contexts like {@link Bag} or {@link Ngram}.
202     * 
203     * @return The base feature name.
204     */
205    public String getName();
206
207    /**
208     * Extracts features in the given context.
209     * 
210     * @param jCas
211     *          The {@link JCas} containing the focus annotation.
212     * @param focusAnnotation
213     *          The annotation whose context is to be searched.
214     * @param annotationClass
215     *          The type of annotation to be found in the context.
216     * @param extractor
217     *          The feature extractor that should be applied to each annotation found in the
218     *          context.
219     * @return The list of features extracted.
220     */
221    public <SEARCH_T extends Annotation> List<Feature> extract(
222        JCas jCas,
223        Annotation focusAnnotation,
224        Bounds bounds,
225        Class<SEARCH_T> annotationClass,
226        FeatureExtractor1<SEARCH_T> extractor) throws CleartkExtractorException;
227  }
228
229  /**
230   * A subclass of {@link Feature} that the base feature extractors wrap their features in. This
231   * enables aggregating contexts like {@link Bag} or {@link Ngram} to name their features properly.
232   */
233  private static class ContextFeature extends Feature {
234    private static final long serialVersionUID = 1L;
235
236    public Feature feature;
237
238    public ContextFeature(String baseName, Feature feature) {
239      this.feature = feature;
240      this.setName(Feature.createName(baseName, feature.getName()));
241      this.setValue(this.feature.getValue());
242
243    }
244
245    public ContextFeature(String baseName, int position, Feature feature) {
246      this.feature = feature;
247      this.setName(Feature.createName(baseName, String.valueOf(position), feature.getName()));
248      this.setValue(feature.getValue());
249    }
250
251    public ContextFeature(String baseName, int position, int oobPosition, String featureName) {
252      this.feature = new Feature(featureName, String.format(Locale.ROOT, "OOB%d", oobPosition));
253      this.setName(Feature.createName(baseName, String.valueOf(position), featureName));
254      this.setValue(this.feature.getValue());
255    }
256
257  }
258
259  public static class NestedCountFeature extends ContextFeature {
260    private static final long serialVersionUID = 1L;
261
262    public final Object countedValue;
263
264    public NestedCountFeature(String baseName, Feature feature, Object countedValue) {
265      super(baseName, feature);
266      this.countedValue = countedValue;
267    }
268
269  }
270
271  /**
272   * Base class for simple contexts that have a single begin and end.
273   */
274  private static abstract class Context_ImplBase implements Context {
275    protected int begin;
276
277    protected int end;
278
279    private String name;
280
281    public Context_ImplBase(int begin, int end) {
282      if (begin > end) {
283        String message = "expected begin < end, found begin=%d end=%d";
284        throw new IllegalArgumentException(String.format(message, begin, end));
285      }
286      this.begin = begin;
287      this.end = end;
288      this.name = Feature.createName(
289          this.getClass().getSimpleName(),
290          String.valueOf(this.begin),
291          String.valueOf(this.end));
292    }
293
294    @Override
295    public String getName() {
296      return this.name;
297    }
298
299    /**
300     * Select annotations of the given type in the context of the focus annotation. The returned
301     * annotations should be in order (smaller begin/end offsets before larger begin/end offsets).
302     * 
303     * @param jCas
304     *          The {@link JCas} containing the focus annotation.
305     * @param focusAnnotation
306     *          The annotation whose context is to be searched.
307     * @param annotationClass
308     *          The type of annotation to be found in the context.
309     * @param count
310     *          The number of annotations to select. A smaller number may be returned if it is not
311     *          possible to select the requested number.
312     * @return The annotations in the context of the focus annotation.
313     */
314    protected abstract <T extends Annotation> List<T> select(
315        JCas jCas,
316        Annotation focusAnnotation,
317        Class<T> annotationClass,
318        int count);
319  }
320
321  /**
322   * Base class for simple contexts that scan their annotations from right to left.
323   */
324  private static abstract class RightToLeftContext extends Context_ImplBase {
325
326    public RightToLeftContext(int begin, int end) {
327      super(begin, end);
328    }
329
330    @Override
331    public <SEARCH_T extends Annotation> List<Feature> extract(
332        JCas jCas,
333        Annotation focusAnnotation,
334        Bounds bounds,
335        Class<SEARCH_T> annotationClass,
336        FeatureExtractor1<SEARCH_T> extractor) throws CleartkExtractorException {
337      String featureName = extractor instanceof NamedFeatureExtractor1
338          ? ((NamedFeatureExtractor1<SEARCH_T>) extractor).getFeatureName()
339          : null;
340
341      // slice the appropriate annotations from the CAS
342      List<SEARCH_T> anns = this.select(jCas, focusAnnotation, annotationClass, this.end);
343      int missing = this.end - anns.size();
344      anns = anns.subList(0, Math.max(0, anns.size() - this.begin));
345
346      // figure out how many items are out of bounds
347      int oobPos = missing;
348      for (SEARCH_T ann : anns) {
349        if (!bounds.contains(ann)) {
350          oobPos += 1;
351        }
352      }
353
354      // extract features at each position
355      List<Feature> features = new ArrayList<Feature>();
356      for (int pos = this.end - 1; pos >= this.begin; pos -= 1) {
357
358        // if the annotation at the current position is in bounds, extract features from it
359        int adjustedPos = this.end - 1 - pos - missing;
360        SEARCH_T ann = adjustedPos >= 0 ? anns.get(adjustedPos) : null;
361        if (ann != null && bounds.contains(ann)) {
362          for (Feature feature : extractor.extract(jCas, ann)) {
363            features.add(new ContextFeature(this.getName(), pos, feature));
364          }
365        }
366
367        // if the annotation at the current position is out of bounds, add an out-of-bounds feature
368        else {
369          features.add(new ContextFeature(this.getName(), pos, oobPos, featureName));
370          oobPos -= 1;
371        }
372      }
373      return features;
374    }
375  }
376
377  /**
378   * Base class for simple contexts that scan their annotations from left to right.
379   */
380  private static abstract class LeftToRightContext extends Context_ImplBase {
381
382    public LeftToRightContext(int begin, int end) {
383      super(begin, end);
384    }
385
386    @Override
387    public <SEARCH_T extends Annotation> List<Feature> extract(
388        JCas jCas,
389        Annotation focusAnnotation,
390        Bounds bounds,
391        Class<SEARCH_T> annotationClass,
392        FeatureExtractor1<SEARCH_T> extractor) throws CleartkExtractorException {
393      String featureName = extractor instanceof NamedFeatureExtractor1
394          ? ((NamedFeatureExtractor1<SEARCH_T>) extractor).getFeatureName()
395          : null;
396      List<SEARCH_T> anns = this.select(jCas, focusAnnotation, annotationClass, this.end);
397      int oobStart;
398      if (this.begin <= anns.size()) {
399        oobStart = 1;
400        anns = anns.subList(this.begin, anns.size());
401      } else {
402        oobStart = this.begin - anns.size() + 1;
403        anns = new ArrayList<SEARCH_T>();
404      }
405      List<Feature> features = new ArrayList<Feature>();
406      Iterator<SEARCH_T> iter = anns.iterator();
407      for (int pos = this.begin, oobPos = oobStart; pos < this.end; pos += 1) {
408        SEARCH_T ann = iter.hasNext() ? iter.next() : null;
409        if (ann != null && bounds.contains(ann)) {
410          for (Feature feature : extractor.extract(jCas, ann)) {
411            features.add(new ContextFeature(this.getName(), pos, feature));
412          }
413        } else {
414          features.add(new ContextFeature(this.getName(), pos, oobPos, featureName));
415          oobPos += 1;
416        }
417      }
418      return features;
419    }
420  }
421
422  /**
423   * A {@link Context} for extracting the focus annotation. This is mainly useful when the focus
424   * annotation should be combined with other annotations using, e.g. a {@link Bag} or {@link Ngram}
425   * to aggregate over several contexts.
426   */
427  public static class Focus implements Context {
428
429    private String name;
430
431    /**
432     * Constructs a context that will extract features over the focus annotation.
433     */
434    public Focus() {
435      this.name = this.getClass().getSimpleName();
436    }
437
438    @Override
439    public String getName() {
440      return this.name;
441    }
442
443    @Override
444    public <SEARCH_T extends Annotation> List<Feature> extract(
445        JCas jCas,
446        Annotation focusAnnotation,
447        Bounds bounds,
448        Class<SEARCH_T> annotationClass,
449        FeatureExtractor1<SEARCH_T> extractor) throws CleartkExtractorException {
450
451      List<Feature> features = new ArrayList<Feature>();
452      for (Feature feature : extractor.extract(jCas, annotationClass.cast(focusAnnotation))) {
453        features.add(new ContextFeature(this.getName(), feature));
454      }
455      return features;
456    }
457
458  }
459
460  /**
461   * A {@link Context} for extracting annotations appearing before the focus annotation.
462   */
463  public static class Preceding extends RightToLeftContext {
464
465    /**
466     * Constructs a context that will extract features over the preceding N annotations.
467     * 
468     * @param end
469     *          The number of annotations to extract.
470     */
471    public Preceding(int end) {
472      super(0, end);
473    }
474
475    /**
476     * Constructs a context that will extract features over a slice of the preceding N annotations.
477     * 
478     * The {@code begin} and {@code end} indexes count from 0, where index 0 identifies the
479     * annotation immediately preceding the focus annotation. If either index is greater than the
480     * index of the earliest possible annotation, special "out of bounds" features will be added for
481     * each annotation that was requested but absent.
482     * 
483     * @param begin
484     *          The index of the first annotation to include.
485     * @param end
486     *          The index of the last annotation to include. Must be greater than {@code begin}.
487     */
488    public Preceding(int begin, int end) {
489      super(begin, end);
490    }
491
492    @Override
493    protected <T extends Annotation> List<T> select(
494        JCas jCas,
495        Annotation focusAnnotation,
496        Class<T> annotationClass,
497        int count) {
498      return JCasUtil.selectPreceding(jCas, annotationClass, focusAnnotation, count);
499    }
500  }
501
502  /**
503   * A {@link Context} for extracting annotations appearing after the focus annotation.
504   */
505  public static class Following extends LeftToRightContext {
506
507    /**
508     * Constructs a context that will extract features over the following N annotations.
509     * 
510     * @param end
511     *          The number of annotations to extract.
512     */
513    public Following(int end) {
514      super(0, end);
515    }
516
517    /**
518     * Constructs a context that will extract features over a slice of the following N annotations.
519     * 
520     * The {@code begin} and {@code end} indexes count from 0, where index 0 identifies the
521     * annotation immediately following the focus annotation. If either index is greater than the
522     * index of the last possible annotation, special "out of bounds" features will be added for
523     * each annotation that was requested but absent.
524     * 
525     * @param begin
526     *          The index of the first annotation to include.
527     * @param end
528     *          The index of the last annotation to include. Must be greater than {@code begin}.
529     */
530    public Following(int begin, int end) {
531      super(begin, end);
532    }
533
534    @Override
535    protected <T extends Annotation> List<T> select(
536        JCas jCas,
537        Annotation focusAnnotation,
538        Class<T> annotationClass,
539        int count) {
540      return JCasUtil.selectFollowing(jCas, annotationClass, focusAnnotation, count);
541    }
542  }
543
544  /**
545   * A {@link Context} for extracting all annotations within the focus annotation.
546   */
547  public static class Covered implements Context {
548
549    /**
550     * Constructs a context that will extract features over all annotations within the focus
551     * annotation.
552     */
553    public Covered() {
554    }
555
556    @Override
557    public String getName() {
558      return "Covered";
559    }
560
561    @Override
562    public <SEARCH_T extends Annotation> List<Feature> extract(
563        JCas jCas,
564        Annotation focusAnnotation,
565        Bounds bounds,
566        Class<SEARCH_T> annotationClass,
567        FeatureExtractor1<SEARCH_T> extractor) throws CleartkExtractorException {
568      List<Feature> features = new ArrayList<Feature>();
569      int pos = 0;
570      for (SEARCH_T ann : JCasUtil.selectCovered(jCas, annotationClass, focusAnnotation)) {
571        for (Feature feature : extractor.extract(jCas, ann)) {
572          features.add(new ContextFeature(this.getName(), pos, feature));
573        }
574        pos += 1;
575      }
576      return features;
577    }
578
579  }
580
581  /**
582   * A {@link Context} for extracting the first annotations within the focus annotation.
583   */
584  public static class FirstCovered extends LeftToRightContext {
585
586    /**
587     * Constructs a context that will extract features over the first N annotations within the focus
588     * annotation.
589     * 
590     * @param end
591     *          The number of annotations to extract.
592     */
593    public FirstCovered(int end) {
594      super(0, end);
595    }
596
597    /**
598     * Constructs a context that will extract features over a slice of the first N annotations
599     * within the focus annotation.
600     * 
601     * The {@code begin} and {@code end} indexes count from 0, where index 0 identifies the first
602     * annotation within the focus annotation. If either index is greater than the index of the last
603     * annotation within the focus annotation, special "out of bounds" features will be added for
604     * each annotation that was requested but absent.
605     * 
606     * @param begin
607     *          The index of the first annotation to include.
608     * @param end
609     *          The index of the last annotation to include. Must be greater than {@code begin}.
610     */
611    public FirstCovered(int begin, int end) {
612      super(begin, end);
613    }
614
615    @Override
616    protected <T extends Annotation> List<T> select(
617        JCas jCas,
618        Annotation focusAnnotation,
619        Class<T> annotationClass,
620        int count) {
621      List<T> anns = JCasUtil.selectCovered(jCas, annotationClass, focusAnnotation);
622      return anns.subList(0, Math.min(count, anns.size()));
623    }
624  }
625
626  /**
627   * A {@link Context} for extracting the last annotations within the focus annotation.
628   */
629  public static class LastCovered extends RightToLeftContext {
630
631    /**
632     * Constructs a context that will extract features over the last N annotations within the focus
633     * annotation.
634     * 
635     * @param end
636     *          The number of annotations to extract.
637     */
638    public LastCovered(int end) {
639      super(0, end);
640    }
641
642    /**
643     * Constructs a context that will extract features over a slice of the last N annotations within
644     * the focus annotation.
645     * 
646     * The {@code begin} and {@code end} indexes count from 0, where index 0 identifies the last
647     * annotation within the focus annotation. If either index is greater than the index of the
648     * first annotation within the focus annotation, special "out of bounds" features will be added
649     * for each annotation that was requested but absent.
650     * 
651     * @param begin
652     *          The index of the first annotation to include.
653     * @param end
654     *          The index of the last annotation to include. Must be greater than {@code begin}.
655     */
656    public LastCovered(int begin, int end) {
657      super(begin, end);
658    }
659
660    @Override
661    protected <T extends Annotation> List<T> select(
662        JCas jCas,
663        Annotation focusAnnotation,
664        Class<T> annotationClass,
665        int count) {
666      List<T> anns = JCasUtil.selectCovered(jCas, annotationClass, focusAnnotation);
667      return anns.subList(Math.max(anns.size() - count, 0), anns.size());
668    }
669  }
670
671  /**
672   * A {@link Context} that aggregates the features of other contexts into a "bag" where position
673   * information of each individual feature is no longer maintained. Position information is not
674   * entirely lost - the span of the bag is encoded as part of the feature name that is shared by
675   * all of the features within the bag.
676   */
677  public static class Bag implements Context {
678
679    private Context[] contexts;
680
681    private String name;
682
683    /**
684     * Constructs a {@link Context} which converts the features extracted by the argument contexts
685     * into a bag of features where all features have the same name.
686     * 
687     * @param contexts
688     *          The contexts which should be combined into a bag.
689     */
690    public Bag(Context... contexts) {
691      this.contexts = contexts;
692      String[] names = new String[contexts.length + 1];
693      names[0] = "Bag";
694      for (int i = 1; i < names.length; ++i) {
695        names[i] = contexts[i - 1].getName();
696      }
697      this.name = Feature.createName(names);
698    }
699
700    @Override
701    public String getName() {
702      return this.name;
703    }
704
705    @Override
706    public <SEARCH_T extends Annotation> List<Feature> extract(
707        JCas jCas,
708        Annotation focusAnnotation,
709        Bounds bounds,
710        Class<SEARCH_T> annotationClass,
711        FeatureExtractor1<SEARCH_T> extractor) throws CleartkExtractorException {
712      List<Feature> features = new ArrayList<Feature>();
713      for (Context context : this.contexts) {
714        for (Feature feature : context.extract(
715            jCas,
716            focusAnnotation,
717            bounds,
718            annotationClass,
719            extractor)) {
720          ContextFeature contextFeature = (ContextFeature) feature;
721          Feature f2 = new Feature(contextFeature.feature.getName(), feature.getValue());
722          features.add(new ContextFeature(this.getName(), f2));
723        }
724      }
725      return features;
726    }
727  }
728
729  /**
730   * A {@link Context} that aggregates the features of other contexts into a bag of counts where
731   * only the count of occurrence of each feature value is maintained. The span (offsets) of the bag
732   * of counts is encoded as part of the feature name.
733   */
734  public static class Count implements Context {
735
736    private Context[] contexts;
737
738    private String name;
739
740    /**
741     * Constructs a {@link Context} which converts the features extracted by the argument contexts
742     * into a bag of count features.
743     * 
744     * @param contexts
745     *          The contexts which should be combined into a bag.
746     */
747    public Count(Context... contexts) {
748      this.contexts = contexts;
749      String[] names = new String[contexts.length + 1];
750      names[0] = "Count";
751      for (int i = 1; i < names.length; ++i) {
752        names[i] = contexts[i - 1].getName();
753      }
754      this.name = Feature.createName(names);
755    }
756
757    @Override
758    public String getName() {
759      return this.name;
760    }
761
762    /**
763     * This method got a bit gnarly in order to support nested Count contexts. It isn't clear why
764     * someone would want to do this but we figured that it should just work even if it may be
765     * contrived to set up a CleartkExtractor this way. The problems comes up if there are multiple
766     * nested count contexts and keeping track of what exactly is being counted. The class
767     * NestedContextFeature does this for us.
768     */
769    @Override
770    public <SEARCH_T extends Annotation> List<Feature> extract(
771        JCas jCas,
772        Annotation focusAnnotation,
773        Bounds bounds,
774        Class<SEARCH_T> annotationClass,
775        FeatureExtractor1<SEARCH_T> extractor) throws CleartkExtractorException {
776      Multiset<String> featureCounts = LinkedHashMultiset.create();
777      Map<String, Feature> featureMap = new HashMap<String, Feature>();
778      for (Context context : this.contexts) {
779        for (Feature feature : context.extract(
780            jCas,
781            focusAnnotation,
782            bounds,
783            annotationClass,
784            extractor)) {
785
786          String countedFeatureValue = null;
787          if (feature instanceof NestedCountFeature) {
788            countedFeatureValue = "" + ((NestedCountFeature) feature).countedValue;
789          }
790
791          String extractorName = extractor instanceof NamedFeatureExtractor1
792              ? ((NamedFeatureExtractor1<SEARCH_T>) extractor).getFeatureName()
793              : null;
794
795          String featureName = Feature.createName(
796              this.name,
797              extractorName,
798              countedFeatureValue,
799              String.valueOf(feature.getValue()));
800          featureCounts.add(featureName);
801          featureMap.put(featureName, feature);
802        }
803      }
804      List<Feature> features = new ArrayList<Feature>();
805      for (String featureName : featureCounts.elementSet()) {
806        Feature feature = featureMap.get(featureName);
807        String countedFeatureValue = "" + feature.getValue();
808        if (feature instanceof NestedCountFeature) {
809          countedFeatureValue = ((NestedCountFeature) feature).countedValue + "_"
810              + countedFeatureValue;
811        }
812        features.add(new NestedCountFeature(featureName, new Feature(
813            featureCounts.count(featureName)), countedFeatureValue));
814      }
815      return features;
816    }
817  }
818
819  /**
820   * A {@link Context} that aggregates the features of other contexts into a single "ngram" feature,
821   * where the feature values are concatenated together in order to form a single value.
822   */
823  public static class Ngram implements Context {
824    private Context[] contexts;
825
826    private String name;
827
828    /**
829     * Constructs a {@link Context} which converts the features extracted by the argument contexts
830     * into a single ngram feature where all feature values have been concatenated together. That
831     * is, it takes everything provided by the contexts and makes a single feature value from it.
832     * For example, the code "new Ngram(new Preceding(2), new Following(2)))" if run on token
833     * annotations would return the feature "A_B_D_E" for the token "C" in the text "A B C D E".
834     * That is, it creates a single ngram from the preceding context and following context. Please
835     * see org.cleartk.ml.feature.extractor.CleartkExtractorTest.testNgram() to run this example.
836     * 
837     * 
838     * @param contexts
839     *          The contexts which should be combined into an ngram.
840     */
841    public Ngram(Context... contexts) {
842      this.contexts = contexts;
843      String[] names = new String[contexts.length + 1];
844      names[0] = "Ngram";
845      for (int i = 1; i < names.length; ++i) {
846        names[i] = contexts[i - 1].getName();
847      }
848      this.name = Feature.createName(names);
849    }
850
851    @Override
852    public String getName() {
853      return this.name;
854    }
855
856    @Override
857    public <SEARCH_T extends Annotation> List<Feature> extract(
858        JCas jCas,
859        Annotation focusAnnotation,
860        Bounds bounds,
861        Class<SEARCH_T> annotationClass,
862        FeatureExtractor1<SEARCH_T> extractor) throws CleartkExtractorException {
863      String featureName = extractor instanceof NamedFeatureExtractor1
864          ? ((NamedFeatureExtractor1<SEARCH_T>) extractor).getFeatureName()
865          : null;
866      List<String> values = new ArrayList<String>();
867      for (Context context : this.contexts) {
868        for (Feature feature : context.extract(
869            jCas,
870            focusAnnotation,
871            bounds,
872            annotationClass,
873            extractor)) {
874          values.add(String.valueOf(feature.getValue()));
875        }
876      }
877      Feature feature = new Feature(featureName, Joiner.on('_').join(values));
878      List<Feature> features = new ArrayList<Feature>();
879      features.add(new ContextFeature(this.getName(), feature));
880      return features;
881    }
882  }
883
884  /**
885   * A {@link Context} that aggregates the features of other contexts into several "ngrams"
886   * features, where sub-sequences of the feature values are concatenated together in order to form
887   * single values.
888   */
889  public static class Ngrams implements Context {
890    private int n;
891
892    private Context[] contexts;
893
894    private String name;
895
896    /**
897     * Constructs a {@link Context} which converts the features extracted by the argument contexts
898     * into ngram features where sub-sequences feature values have been concatenated together.
899     * 
900     * For example, Ngrams(2, context) will extract all bigrams of features generated in the given
901     * context.
902     * 
903     * @param n
904     *          The length of the n-gram features
905     * @param contexts
906     *          The contexts which should be combined into an ngram.
907     */
908    public Ngrams(int n, Context... contexts) {
909      this.n = n;
910      this.contexts = contexts;
911      String[] names = new String[contexts.length + 1];
912      names[0] = this.n + "grams";
913      for (int i = 1; i < names.length; ++i) {
914        names[i] = contexts[i - 1].getName();
915      }
916      this.name = Feature.createName(names);
917    }
918
919    @Override
920    public String getName() {
921      return this.name;
922    }
923
924    @Override
925    public <SEARCH_T extends Annotation> List<Feature> extract(
926        JCas jCas,
927        Annotation focusAnnotation,
928        Bounds bounds,
929        Class<SEARCH_T> annotationClass,
930        FeatureExtractor1<SEARCH_T> extractor) throws CleartkExtractorException {
931      String featureName = extractor instanceof NamedFeatureExtractor1
932          ? ((NamedFeatureExtractor1<SEARCH_T>) extractor).getFeatureName()
933          : null;
934      List<Feature> extractedFeatures = new ArrayList<Feature>();
935      for (Context context : this.contexts) {
936        extractedFeatures.addAll(context.extract(
937            jCas,
938            focusAnnotation,
939            bounds,
940            annotationClass,
941            extractor));
942      }
943      List<Feature> features = new ArrayList<Feature>();
944      for (int i = 0; i < extractedFeatures.size() - this.n + 1; ++i) {
945        List<String> values = new ArrayList<String>();
946        for (Feature feature : extractedFeatures.subList(i, i + this.n)) {
947          values.add(feature.getValue().toString());
948        }
949        Feature feature = new Feature(featureName, Joiner.on('_').join(values));
950        features.add(new ContextFeature(this.getName(), feature));
951      }
952      return features;
953    }
954  }
955}