/*
 * Copyright (c) 2011, Regents of the University of Colorado
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
package org.cleartk.ml.feature.extractor;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;

import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.cleartk.ml.Feature;

import com.google.common.base.Joiner;
import com.google.common.collect.LinkedHashMultiset;
import com.google.common.collect.Multiset;

/**
 * A feature extractor that finds other {@link Annotation}s in the context of a focus annotation and
 * extracts features from these. It can be used, for example, to:
 * <ul>
 * <li>Get the text of the 2 tokens before a focus annotation</li>
 * <li>Get the parts of speech of the 3 tokens after a focus annotation</li>
 * <li>Get the tokens after a focus annotation, beginning 2 after and ending 5 after, as a bag of
 * words</li>
 * <li>Get an ngram concatenating the stem of the first word before a focus annotation and the first
 * word contained in the focus annotation</li>
 * </ul>
 *
 * <br>
 * Copyright (c) 2011, Regents of the University of Colorado <br>
 * All rights reserved.
 *
 * @author Steven Bethard
 */
public class CleartkExtractor<FOCUS_T extends Annotation, SEARCH_T extends Annotation> implements
    FeatureExtractor1<FOCUS_T>, FeatureExtractor2<FOCUS_T, FOCUS_T> {

  private final Class<SEARCH_T> annotationClass;

  private final FeatureExtractor1<SEARCH_T> extractor;

  private final Context[] contexts;

  /**
   * Create an extractor that finds {@link Annotation}s of the given type at the specified
   * {@link Context}s and applies the given feature extractor to the annotations.
   *
   * @param annotationClass
   *          The type of annotation which should be searched for in the context.
   * @param extractor
   *          The feature extractor to apply to each annotation found.
   * @param contexts
   *          The contexts where the extractor should look for annotations.
   */
  public CleartkExtractor(
      Class<SEARCH_T> annotationClass,
      FeatureExtractor1<SEARCH_T> extractor,
      Context... contexts) {
    this.annotationClass = annotationClass;
    this.extractor = extractor;
    this.contexts = contexts;
  }

  /**
   * Extract features from the annotations around the focus annotation, without restricting the
   * context to any bounds.
   *
   * @param view
   *          The JCas containing the focus annotation.
   * @param focusAnnotation
   *          The annotation whose context is to be searched.
   * @return The features extracted in the context of the focus annotation.
   */
  @Override
  public List<Feature> extract(JCas view, FOCUS_T focusAnnotation) throws CleartkExtractorException {
    return this.extract(view, focusAnnotation, new NoBounds());
  }

  /**
   * Extract features from the annotations around the focus annotation and within the given bounds.
   *
   * @param view
   *          The JCas containing the focus annotation.
   * @param focusAnnotation
   *          The annotation whose context is to be searched.
   * @param boundsAnnotation
   *          The boundary within which context annotations may be identified.
   * @return The features extracted in the context of the focus annotation.
   */
  public List<Feature> extractWithin(JCas view, FOCUS_T focusAnnotation, Annotation boundsAnnotation)
      throws CleartkExtractorException {
    Bounds bounds = new SpanBounds(boundsAnnotation.getBegin(), boundsAnnotation.getEnd());
    return this.extract(view, focusAnnotation, bounds);
  }

  /**
   * Extract features using, as the focus, the span that starts at the end of the first annotation
   * and ends at the beginning of the second annotation. No bounds are applied.
   *
   * @param view
   *          The JCas containing the annotations.
   * @param annotation1
   *          The annotation whose end offset starts the focus span.
   * @param annotation2
   *          The annotation whose begin offset ends the focus span.
   * @return The features extracted in the context of the span between the two annotations.
   */
  public List<Feature> extractBetween(JCas view, FOCUS_T annotation1, FOCUS_T annotation2)
      throws CleartkExtractorException {
    int begin = annotation1.getEnd();
    int end = annotation2.getBegin();
    // FIXME: creating a new annotation may leak memory - is there a better approach?
    Annotation focusAnnotation = new Annotation(view, begin, end);
    return this.extract(view, focusAnnotation, new NoBounds());
  }

  /**
   * Two-annotation form of extraction; delegates to
   * {@link #extractBetween(JCas, Annotation, Annotation)}.
   */
  @Override
  public List<Feature> extract(JCas view, FOCUS_T annotation1, FOCUS_T annotation2)
      throws CleartkExtractorException {
    return this.extractBetween(view, annotation1, annotation2);
  }

  /**
   * Runs every configured {@link Context} over the focus annotation and concatenates the resulting
   * features in context order.
   */
  private List<Feature> extract(JCas view, Annotation focusAnnotation, Bounds bounds)
      throws CleartkExtractorException {
    List<Feature> features = new ArrayList<Feature>();
    for (Context context : this.contexts) {
      features.addAll(context.extract(
          view,
          focusAnnotation,
          bounds,
          this.annotationClass,
          this.extractor));
    }
    return features;
  }

  /**
   * Returns the feature name declared by the extractor if it is a {@link NamedFeatureExtractor1},
   * or {@code null} otherwise.
   */
  private static <T extends Annotation> String featureNameOf(FeatureExtractor1<T> extractor) {
    return extractor instanceof NamedFeatureExtractor1
        ? ((NamedFeatureExtractor1<T>) extractor).getFeatureName()
        : null;
  }

  /**
   * Builds the name of an aggregating context from a prefix followed by the names of the contexts
   * it aggregates.
   */
  private static String aggregateName(String prefix, Context[] contexts) {
    String[] names = new String[contexts.length + 1];
    names[0] = prefix;
    for (int i = 0; i < contexts.length; ++i) {
      names[i + 1] = contexts[i].getName();
    }
    return Feature.createName(names);
  }

  /**
   * A class representing the bounds within which a {@link CleartkExtractor} should look for
   * annotations.
   */
  public static interface Bounds {

    /**
     * Determines whether or not an annotation lies within the given bounds.
     *
     * @param annotation
     *          The annotation to be checked.
     * @return True if the annotation lies within the bounds.
     */
    public boolean contains(Annotation annotation);
  }

  /**
   * A Bounds implementation that puts no restrictions on the context.
   */
  private static class NoBounds implements Bounds {

    public NoBounds() {
    }

    @Override
    public boolean contains(Annotation annotation) {
      return true;
    }

  }

  /**
   * A Bounds implementation that restricts the context to annotations within a given span.
   */
  private static class SpanBounds implements Bounds {

    private final int begin;

    private final int end;

    public SpanBounds(int begin, int end) {
      this.begin = begin;
      this.end = end;
    }

    /**
     * An annotation is contained only if it lies entirely inside the span (both offsets
     * inclusive).
     */
    @Override
    public boolean contains(Annotation annotation) {
      return annotation.getBegin() >= this.begin && annotation.getEnd() <= this.end;
    }

  }

  /**
   * A class representing a location that a {@link CleartkExtractor} should look for annotations.
   */
  public static interface Context {

    /**
     * Gets the base feature name that will be used in {@link Feature}s generated by this context.
     * The actual feature names may include additional information (e.g. relative position), but
     * this base name should be used in all aggregating contexts like {@link Bag} or {@link Ngram}.
     *
     * @return The base feature name.
     */
    public String getName();

    /**
     * Extracts features in the given context.
     *
     * @param jCas
     *          The {@link JCas} containing the focus annotation.
     * @param focusAnnotation
     *          The annotation whose context is to be searched.
     * @param bounds
     *          The bounds within which context annotations may be identified.
     * @param annotationClass
     *          The type of annotation to be found in the context.
     * @param extractor
     *          The feature extractor that should be applied to each annotation found in the
     *          context.
     * @return The list of features extracted.
     */
    public <SEARCH_T extends Annotation> List<Feature> extract(
        JCas jCas,
        Annotation focusAnnotation,
        Bounds bounds,
        Class<SEARCH_T> annotationClass,
        FeatureExtractor1<SEARCH_T> extractor) throws CleartkExtractorException;
  }

  /**
   * A subclass of {@link Feature} that the base feature extractors wrap their features in. This
   * enables aggregating contexts like {@link Bag} or {@link Ngram} to name their features properly.
   */
  private static class ContextFeature extends Feature {
    private static final long serialVersionUID = 1L;

    /** The wrapped feature, kept so aggregating contexts can recover its original name. */
    public Feature feature;

    /**
     * Wraps a feature, naming the wrapper from the base name and the feature's own name.
     */
    public ContextFeature(String baseName, Feature feature) {
      this.feature = feature;
      this.setName(Feature.createName(baseName, feature.getName()));
      this.setValue(this.feature.getValue());
    }

    /**
     * Wraps a feature, naming the wrapper from the base name, the position and the feature's own
     * name.
     */
    public ContextFeature(String baseName, int position, Feature feature) {
      this.feature = feature;
      this.setName(Feature.createName(baseName, String.valueOf(position), feature.getName()));
      this.setValue(feature.getValue());
    }

    /**
     * Creates an "out of bounds" feature whose value is {@code "OOB" + oobPosition}.
     */
    public ContextFeature(String baseName, int position, int oobPosition, String featureName) {
      this.feature = new Feature(featureName, String.format(Locale.ROOT, "OOB%d", oobPosition));
      this.setName(Feature.createName(baseName, String.valueOf(position), featureName));
      this.setValue(this.feature.getValue());
    }

  }

  /**
   * A {@link ContextFeature} produced by {@link Count} that additionally remembers the value that
   * was counted, so that nested {@link Count} contexts can keep track of what is being counted.
   */
  public static class NestedCountFeature extends ContextFeature {
    private static final long serialVersionUID = 1L;

    public final Object countedValue;

    public NestedCountFeature(String baseName, Feature feature, Object countedValue) {
      super(baseName, feature);
      this.countedValue = countedValue;
    }

  }

  /**
   * Base class for simple contexts that have a single begin and end.
   */
  private static abstract class Context_ImplBase implements Context {
    protected int begin;

    protected int end;

    private final String name;

    /**
     * @param begin
     *          The index of the first annotation to include.
     * @param end
     *          The index one past the last annotation to include. Must not be less than
     *          {@code begin}.
     * @throws IllegalArgumentException
     *           If {@code begin} is greater than {@code end}.
     */
    public Context_ImplBase(int begin, int end) {
      if (begin > end) {
        // the check permits begin == end, so the message states the actual requirement
        String message = "expected begin <= end, found begin=%d end=%d";
        throw new IllegalArgumentException(String.format(Locale.ROOT, message, begin, end));
      }
      this.begin = begin;
      this.end = end;
      this.name = Feature.createName(
          this.getClass().getSimpleName(),
          String.valueOf(this.begin),
          String.valueOf(this.end));
    }

    @Override
    public String getName() {
      return this.name;
    }

    /**
     * Select annotations of the given type in the context of the focus annotation. The returned
     * annotations should be in order (smaller begin/end offsets before larger begin/end offsets).
     *
     * @param jCas
     *          The {@link JCas} containing the focus annotation.
     * @param focusAnnotation
     *          The annotation whose context is to be searched.
     * @param annotationClass
     *          The type of annotation to be found in the context.
     * @param count
     *          The number of annotations to select. A smaller number may be returned if it is not
     *          possible to select the requested number.
     * @return The annotations in the context of the focus annotation.
     */
    protected abstract <T extends Annotation> List<T> select(
        JCas jCas,
        Annotation focusAnnotation,
        Class<T> annotationClass,
        int count);
  }

  /**
   * Base class for simple contexts that scan their annotations from right to left.
   */
  private static abstract class RightToLeftContext extends Context_ImplBase {

    public RightToLeftContext(int begin, int end) {
      super(begin, end);
    }

    @Override
    public <SEARCH_T extends Annotation> List<Feature> extract(
        JCas jCas,
        Annotation focusAnnotation,
        Bounds bounds,
        Class<SEARCH_T> annotationClass,
        FeatureExtractor1<SEARCH_T> extractor) throws CleartkExtractorException {
      String featureName = featureNameOf(extractor);

      // slice the appropriate annotations from the CAS
      List<SEARCH_T> anns = this.select(jCas, focusAnnotation, annotationClass, this.end);
      int missing = this.end - anns.size();
      // drop the `begin` annotations closest to the focus (they are at the end of the list)
      anns = anns.subList(0, Math.max(0, anns.size() - this.begin));

      // figure out how many items are out of bounds
      int oobPos = missing;
      for (SEARCH_T ann : anns) {
        if (!bounds.contains(ann)) {
          oobPos += 1;
        }
      }

      // extract features at each position
      List<Feature> features = new ArrayList<Feature>();
      for (int pos = this.end - 1; pos >= this.begin; pos -= 1) {

        // if the annotation at the current position is in bounds, extract features from it
        int adjustedPos = this.end - 1 - pos - missing;
        SEARCH_T ann = adjustedPos >= 0 ? anns.get(adjustedPos) : null;
        if (ann != null && bounds.contains(ann)) {
          for (Feature feature : extractor.extract(jCas, ann)) {
            features.add(new ContextFeature(this.getName(), pos, feature));
          }
        }

        // if the annotation at the current position is out of bounds, add an out-of-bounds feature
        else {
          features.add(new ContextFeature(this.getName(), pos, oobPos, featureName));
          oobPos -= 1;
        }
      }
      return features;
    }
  }

  /**
   * Base class for simple contexts that scan their annotations from left to right.
   */
  private static abstract class LeftToRightContext extends Context_ImplBase {

    public LeftToRightContext(int begin, int end) {
      super(begin, end);
    }

    @Override
    public <SEARCH_T extends Annotation> List<Feature> extract(
        JCas jCas,
        Annotation focusAnnotation,
        Bounds bounds,
        Class<SEARCH_T> annotationClass,
        FeatureExtractor1<SEARCH_T> extractor) throws CleartkExtractorException {
      String featureName = featureNameOf(extractor);
      List<SEARCH_T> anns = this.select(jCas, focusAnnotation, annotationClass, this.end);

      // skip the first `begin` annotations; if there are not even that many, start the
      // out-of-bounds numbering past the ones that were skipped but absent
      int oobStart;
      if (this.begin <= anns.size()) {
        oobStart = 1;
        anns = anns.subList(this.begin, anns.size());
      } else {
        oobStart = this.begin - anns.size() + 1;
        anns = new ArrayList<SEARCH_T>();
      }

      List<Feature> features = new ArrayList<Feature>();
      Iterator<SEARCH_T> iter = anns.iterator();
      for (int pos = this.begin, oobPos = oobStart; pos < this.end; pos += 1) {
        SEARCH_T ann = iter.hasNext() ? iter.next() : null;
        if (ann != null && bounds.contains(ann)) {
          for (Feature feature : extractor.extract(jCas, ann)) {
            features.add(new ContextFeature(this.getName(), pos, feature));
          }
        } else {
          // absent or out-of-bounds annotation: emit a placeholder feature instead
          features.add(new ContextFeature(this.getName(), pos, oobPos, featureName));
          oobPos += 1;
        }
      }
      return features;
    }
  }

  /**
   * A {@link Context} for extracting the focus annotation. This is mainly useful when the focus
   * annotation should be combined with other annotations using, e.g. a {@link Bag} or {@link Ngram}
   * to aggregate over several contexts.
   */
  public static class Focus implements Context {

    private final String name;

    /**
     * Constructs a context that will extract features over the focus annotation.
     */
    public Focus() {
      this.name = this.getClass().getSimpleName();
    }

    @Override
    public String getName() {
      return this.name;
    }

    /**
     * Applies the extractor to the focus annotation itself. The focus annotation is cast to the
     * search annotation type, so it must be an instance of {@code annotationClass}. The bounds are
     * not consulted.
     */
    @Override
    public <SEARCH_T extends Annotation> List<Feature> extract(
        JCas jCas,
        Annotation focusAnnotation,
        Bounds bounds,
        Class<SEARCH_T> annotationClass,
        FeatureExtractor1<SEARCH_T> extractor) throws CleartkExtractorException {

      List<Feature> features = new ArrayList<Feature>();
      for (Feature feature : extractor.extract(jCas, annotationClass.cast(focusAnnotation))) {
        features.add(new ContextFeature(this.getName(), feature));
      }
      return features;
    }

  }

  /**
   * A {@link Context} for extracting annotations appearing before the focus annotation.
   */
  public static class Preceding extends RightToLeftContext {

    /**
     * Constructs a context that will extract features over the preceding N annotations.
     *
     * @param end
     *          The number of annotations to extract.
     */
    public Preceding(int end) {
      super(0, end);
    }

    /**
     * Constructs a context that will extract features over a slice of the preceding N annotations.
     *
     * The {@code begin} and {@code end} indexes count from 0, where index 0 identifies the
     * annotation immediately preceding the focus annotation. If either index is greater than the
     * index of the earliest possible annotation, special "out of bounds" features will be added for
     * each annotation that was requested but absent.
     *
     * @param begin
     *          The index of the first annotation to include.
     * @param end
     *          The index one past the last annotation to include (exclusive). Must not be less
     *          than {@code begin}.
     */
    public Preceding(int begin, int end) {
      super(begin, end);
    }

    @Override
    protected <T extends Annotation> List<T> select(
        JCas jCas,
        Annotation focusAnnotation,
        Class<T> annotationClass,
        int count) {
      return JCasUtil.selectPreceding(jCas, annotationClass, focusAnnotation, count);
    }
  }

  /**
   * A {@link Context} for extracting annotations appearing after the focus annotation.
   */
  public static class Following extends LeftToRightContext {

    /**
     * Constructs a context that will extract features over the following N annotations.
     *
     * @param end
     *          The number of annotations to extract.
     */
    public Following(int end) {
      super(0, end);
    }

    /**
     * Constructs a context that will extract features over a slice of the following N annotations.
     *
     * The {@code begin} and {@code end} indexes count from 0, where index 0 identifies the
     * annotation immediately following the focus annotation. If either index is greater than the
     * index of the last possible annotation, special "out of bounds" features will be added for
     * each annotation that was requested but absent.
     *
     * @param begin
     *          The index of the first annotation to include.
     * @param end
     *          The index one past the last annotation to include (exclusive). Must not be less
     *          than {@code begin}.
     */
    public Following(int begin, int end) {
      super(begin, end);
    }

    @Override
    protected <T extends Annotation> List<T> select(
        JCas jCas,
        Annotation focusAnnotation,
        Class<T> annotationClass,
        int count) {
      return JCasUtil.selectFollowing(jCas, annotationClass, focusAnnotation, count);
    }
  }

  /**
   * A {@link Context} for extracting all annotations within the focus annotation.
   */
  public static class Covered implements Context {

    /**
     * Constructs a context that will extract features over all annotations within the focus
     * annotation.
     */
    public Covered() {
    }

    @Override
    public String getName() {
      return "Covered";
    }

    /**
     * Applies the extractor to every annotation of the search type covered by the focus
     * annotation, numbering the features by the position of the covered annotation. The bounds
     * are not consulted.
     */
    @Override
    public <SEARCH_T extends Annotation> List<Feature> extract(
        JCas jCas,
        Annotation focusAnnotation,
        Bounds bounds,
        Class<SEARCH_T> annotationClass,
        FeatureExtractor1<SEARCH_T> extractor) throws CleartkExtractorException {
      List<Feature> features = new ArrayList<Feature>();
      int pos = 0;
      for (SEARCH_T ann : JCasUtil.selectCovered(jCas, annotationClass, focusAnnotation)) {
        for (Feature feature : extractor.extract(jCas, ann)) {
          features.add(new ContextFeature(this.getName(), pos, feature));
        }
        pos += 1;
      }
      return features;
    }

  }

  /**
   * A {@link Context} for extracting the first annotations within the focus annotation.
   */
  public static class FirstCovered extends LeftToRightContext {

    /**
     * Constructs a context that will extract features over the first N annotations within the focus
     * annotation.
     *
     * @param end
     *          The number of annotations to extract.
     */
    public FirstCovered(int end) {
      super(0, end);
    }

    /**
     * Constructs a context that will extract features over a slice of the first N annotations
     * within the focus annotation.
     *
     * The {@code begin} and {@code end} indexes count from 0, where index 0 identifies the first
     * annotation within the focus annotation. If either index is greater than the index of the last
     * annotation within the focus annotation, special "out of bounds" features will be added for
     * each annotation that was requested but absent.
     *
     * @param begin
     *          The index of the first annotation to include.
     * @param end
     *          The index one past the last annotation to include (exclusive). Must not be less
     *          than {@code begin}.
     */
    public FirstCovered(int begin, int end) {
      super(begin, end);
    }

    @Override
    protected <T extends Annotation> List<T> select(
        JCas jCas,
        Annotation focusAnnotation,
        Class<T> annotationClass,
        int count) {
      List<T> anns = JCasUtil.selectCovered(jCas, annotationClass, focusAnnotation);
      return anns.subList(0, Math.min(count, anns.size()));
    }
  }

  /**
   * A {@link Context} for extracting the last annotations within the focus annotation.
   */
  public static class LastCovered extends RightToLeftContext {

    /**
     * Constructs a context that will extract features over the last N annotations within the focus
     * annotation.
     *
     * @param end
     *          The number of annotations to extract.
     */
    public LastCovered(int end) {
      super(0, end);
    }

    /**
     * Constructs a context that will extract features over a slice of the last N annotations within
     * the focus annotation.
     *
     * The {@code begin} and {@code end} indexes count from 0, where index 0 identifies the last
     * annotation within the focus annotation. If either index is greater than the index of the
     * first annotation within the focus annotation, special "out of bounds" features will be added
     * for each annotation that was requested but absent.
     *
     * @param begin
     *          The index of the first annotation to include.
     * @param end
     *          The index one past the last annotation to include (exclusive). Must not be less
     *          than {@code begin}.
     */
    public LastCovered(int begin, int end) {
      super(begin, end);
    }

    @Override
    protected <T extends Annotation> List<T> select(
        JCas jCas,
        Annotation focusAnnotation,
        Class<T> annotationClass,
        int count) {
      List<T> anns = JCasUtil.selectCovered(jCas, annotationClass, focusAnnotation);
      return anns.subList(Math.max(anns.size() - count, 0), anns.size());
    }
  }

  /**
   * A {@link Context} that aggregates the features of other contexts into a "bag" where position
   * information of each individual feature is no longer maintained. Position information is not
   * entirely lost - the span of the bag is encoded as part of the feature name that is shared by
   * all of the features within the bag.
   */
  public static class Bag implements Context {

    private final Context[] contexts;

    private final String name;

    /**
     * Constructs a {@link Context} which converts the features extracted by the argument contexts
     * into a bag of features where all features have the same name.
     *
     * @param contexts
     *          The contexts which should be combined into a bag.
     */
    public Bag(Context... contexts) {
      this.contexts = contexts;
      this.name = aggregateName("Bag", contexts);
    }

    @Override
    public String getName() {
      return this.name;
    }

    /**
     * Re-wraps every feature produced by the aggregated contexts under the bag's name, using the
     * name of the originally wrapped feature rather than the position-specific name.
     */
    @Override
    public <SEARCH_T extends Annotation> List<Feature> extract(
        JCas jCas,
        Annotation focusAnnotation,
        Bounds bounds,
        Class<SEARCH_T> annotationClass,
        FeatureExtractor1<SEARCH_T> extractor) throws CleartkExtractorException {
      List<Feature> features = new ArrayList<Feature>();
      for (Context context : this.contexts) {
        for (Feature feature : context.extract(
            jCas,
            focusAnnotation,
            bounds,
            annotationClass,
            extractor)) {
          // the aggregated contexts are expected to produce ContextFeature instances
          ContextFeature contextFeature = (ContextFeature) feature;
          Feature f2 = new Feature(contextFeature.feature.getName(), feature.getValue());
          features.add(new ContextFeature(this.getName(), f2));
        }
      }
      return features;
    }
  }

  /**
   * A {@link Context} that aggregates the features of other contexts into a bag of counts where
   * only the count of occurrence of each feature value is maintained. The span (offsets) of the bag
   * of counts is encoded as part of the feature name.
   */
  public static class Count implements Context {

    private final Context[] contexts;

    private final String name;

    /**
     * Constructs a {@link Context} which converts the features extracted by the argument contexts
     * into a bag of count features.
     *
     * @param contexts
     *          The contexts which should be combined into a bag.
     */
    public Count(Context... contexts) {
      this.contexts = contexts;
      this.name = aggregateName("Count", contexts);
    }

    @Override
    public String getName() {
      return this.name;
    }

    /**
     * This method got a bit gnarly in order to support nested Count contexts. It isn't clear why
     * someone would want to do this but we figured that it should just work even if it may be
     * contrived to set up a CleartkExtractor this way. The problem comes up if there are multiple
     * nested count contexts and keeping track of what exactly is being counted. The class
     * {@link NestedCountFeature} does this for us.
     */
    @Override
    public <SEARCH_T extends Annotation> List<Feature> extract(
        JCas jCas,
        Annotation focusAnnotation,
        Bounds bounds,
        Class<SEARCH_T> annotationClass,
        FeatureExtractor1<SEARCH_T> extractor) throws CleartkExtractorException {
      // the extractor name does not depend on the feature, so compute it once
      String extractorName = featureNameOf(extractor);

      // insertion-ordered multiset keeps the output order stable across runs
      Multiset<String> featureCounts = LinkedHashMultiset.create();
      Map<String, Feature> featureMap = new HashMap<String, Feature>();
      for (Context context : this.contexts) {
        for (Feature feature : context.extract(
            jCas,
            focusAnnotation,
            bounds,
            annotationClass,
            extractor)) {

          String countedFeatureValue = null;
          if (feature instanceof NestedCountFeature) {
            countedFeatureValue = "" + ((NestedCountFeature) feature).countedValue;
          }

          String featureName = Feature.createName(
              this.name,
              extractorName,
              countedFeatureValue,
              String.valueOf(feature.getValue()));
          featureCounts.add(featureName);
          featureMap.put(featureName, feature);
        }
      }

      // emit one count feature per distinct feature name
      List<Feature> features = new ArrayList<Feature>();
      for (String featureName : featureCounts.elementSet()) {
        Feature feature = featureMap.get(featureName);
        String countedFeatureValue = "" + feature.getValue();
        if (feature instanceof NestedCountFeature) {
          countedFeatureValue = ((NestedCountFeature) feature).countedValue + "_"
              + countedFeatureValue;
        }
        features.add(new NestedCountFeature(featureName, new Feature(
            featureCounts.count(featureName)), countedFeatureValue));
      }
      return features;
    }
  }

  /**
   * A {@link Context} that aggregates the features of other contexts into a single "ngram" feature,
   * where the feature values are concatenated together in order to form a single value.
   */
  public static class Ngram implements Context {
    private final Context[] contexts;

    private final String name;

    /**
     * Constructs a {@link Context} which converts the features extracted by the argument contexts
     * into a single ngram feature where all feature values have been concatenated together. That
     * is, it takes everything provided by the contexts and makes a single feature value from it.
     * For example, the code "new Ngram(new Preceding(2), new Following(2)))" if run on token
     * annotations would return the feature "A_B_D_E" for the token "C" in the text "A B C D E".
     * That is, it creates a single ngram from the preceding context and following context. Please
     * see org.cleartk.ml.feature.extractor.CleartkExtractorTest.testNgram() to run this example.
     *
     *
     * @param contexts
     *          The contexts which should be combined into an ngram.
     */
    public Ngram(Context... contexts) {
      this.contexts = contexts;
      this.name = aggregateName("Ngram", contexts);
    }

    @Override
    public String getName() {
      return this.name;
    }

    /**
     * Joins the values of all features produced by the aggregated contexts with {@code '_'} and
     * returns them as a single feature.
     */
    @Override
    public <SEARCH_T extends Annotation> List<Feature> extract(
        JCas jCas,
        Annotation focusAnnotation,
        Bounds bounds,
        Class<SEARCH_T> annotationClass,
        FeatureExtractor1<SEARCH_T> extractor) throws CleartkExtractorException {
      String featureName = featureNameOf(extractor);
      List<String> values = new ArrayList<String>();
      for (Context context : this.contexts) {
        for (Feature feature : context.extract(
            jCas,
            focusAnnotation,
            bounds,
            annotationClass,
            extractor)) {
          values.add(String.valueOf(feature.getValue()));
        }
      }
      Feature feature = new Feature(featureName, Joiner.on('_').join(values));
      List<Feature> features = new ArrayList<Feature>();
      features.add(new ContextFeature(this.getName(), feature));
      return features;
    }
  }

  /**
   * A {@link Context} that aggregates the features of other contexts into several "ngrams"
   * features, where sub-sequences of the feature values are concatenated together in order to form
   * single values.
   */
  public static class Ngrams implements Context {
    private final int n;

    private final Context[] contexts;

    private final String name;

    /**
     * Constructs a {@link Context} which converts the features extracted by the argument contexts
     * into ngram features where sub-sequences feature values have been concatenated together.
     *
     * For example, Ngrams(2, context) will extract all bigrams of features generated in the given
     * context.
     *
     * @param n
     *          The length of the n-gram features. Must be at least 1.
     * @param contexts
     *          The contexts which should be combined into an ngram.
     * @throws IllegalArgumentException
     *           If {@code n} is less than 1.
     */
    public Ngrams(int n, Context... contexts) {
      if (n < 1) {
        // fail fast: n == 0 would yield empty-valued features and n < 0 fails later in subList
        throw new IllegalArgumentException(String.format(
            Locale.ROOT,
            "expected n >= 1, found n=%d",
            n));
      }
      this.n = n;
      this.contexts = contexts;
      this.name = aggregateName(this.n + "grams", contexts);
    }

    @Override
    public String getName() {
      return this.name;
    }

    /**
     * Collects the features of all aggregated contexts in order, then emits one feature for every
     * window of {@code n} consecutive features, with the window's values joined by {@code '_'}.
     * Nothing is emitted when fewer than {@code n} features were extracted.
     */
    @Override
    public <SEARCH_T extends Annotation> List<Feature> extract(
        JCas jCas,
        Annotation focusAnnotation,
        Bounds bounds,
        Class<SEARCH_T> annotationClass,
        FeatureExtractor1<SEARCH_T> extractor) throws CleartkExtractorException {
      String featureName = featureNameOf(extractor);
      List<Feature> extractedFeatures = new ArrayList<Feature>();
      for (Context context : this.contexts) {
        extractedFeatures.addAll(context.extract(
            jCas,
            focusAnnotation,
            bounds,
            annotationClass,
            extractor));
      }
      List<Feature> features = new ArrayList<Feature>();
      for (int i = 0; i < extractedFeatures.size() - this.n + 1; ++i) {
        List<String> values = new ArrayList<String>();
        for (Feature feature : extractedFeatures.subList(i, i + this.n)) {
          // String.valueOf tolerates a null feature value, matching Ngram's behavior
          values.add(String.valueOf(feature.getValue()));
        }
        Feature feature = new Feature(featureName, Joiner.on('_').join(values));
        features.add(new ContextFeature(this.getName(), feature));
      }
      return features;
    }
  }
}