001/** 
002 * Copyright (c) 2010, Regents of the University of Colorado 
003 * All rights reserved.
004 * 
005 * Redistribution and use in source and binary forms, with or without
006 * modification, are permitted provided that the following conditions are met:
007 * 
008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
011 * 
012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
022 * POSSIBILITY OF SUCH DAMAGE. 
023 */
024package org.cleartk.util.collection;
025
026import java.io.File;
027import java.io.FileWriter;
028import java.io.IOException;
029import java.io.Writer;
030import java.util.HashMap;
031import java.util.Map;
032import java.util.logging.Logger;
033
034/**
035 * <br>
036 * Copyright (c) 2010, Regents of the University of Colorado <br>
037 * All rights reserved.
038 * 
039 * <p>
040 */
041public class GenericStringMapper implements StringMapper, Writable {
042
043  private static final long serialVersionUID = -9129249759791539649L;
044
045  public GenericStringMapper(int cutoff) {
046    this.cutoff = cutoff;
047  }
048
049  public int getOrGenerateInteger(String s) {
050    if (expandMap) {
051      if (countingMap.containsKey(s)) {
052        Entry e = countingMap.get(s);
053        e.increment();
054        return e.i;
055      } else {
056        Entry e = new Entry(nextValue++);
057        countingMap.put(s, e);
058        return e.i;
059      }
060    } else {
061      throw new UnsupportedOperationException();
062    }
063  }
064
065  public int getInteger(String s) throws UnknownKeyException {
066    if (expandMap) {
067      throw new UnsupportedOperationException();
068    } else {
069      if (stringIntMap.containsKey(s))
070        return stringIntMap.get(s);
071      else
072        throw new UnknownKeyException(s);
073    }
074  }
075
076  public void finalizeMap() {
077    int total = 0;
078    int kept = 0;
079
080    stringIntMap = new HashMap<String, Integer>();
081    for (String s : countingMap.keySet()) {
082      Entry e = countingMap.get(s);
083      total += 1;
084
085      if (e.count >= cutoff) {
086        stringIntMap.put(s, e.i);
087        kept += 1;
088      }
089    }
090
091    if (total - kept > 0) {
092      Logger.getLogger(this.getClass().getName()).info(
093          String.format(
094              "discarded %d features that occurred less than %d times; %d features remaining",
095              total - kept,
096              cutoff,
097              kept));
098    }
099
100    countingMap = null;
101    expandMap = false;
102  }
103
104  public void write(File file) throws IOException {
105    Writer writer = new FileWriter(file);
106    write(writer);
107    writer.close();
108  }
109
110  public void write(Writer writer) throws IOException {
111    if (expandMap)
112      throw new UnsupportedOperationException();
113
114    for (String key : stringIntMap.keySet()) {
115      writer.append(String.format("%d %s\n", stringIntMap.get(key), key));
116    }
117
118    writer.flush();
119  }
120
121  boolean expandMap = true;
122
123  int nextValue = 1;
124
125  int cutoff;
126
127  Map<String, Entry> countingMap = new HashMap<String, Entry>();
128
129  Map<String, Integer> stringIntMap = null;
130
131  private static class Entry {
132    public Entry(int i) {
133      this.i = i;
134      this.count = 1;
135    }
136
137    public void increment() {
138      count += 1;
139    }
140
141    public int i;
142
143    public int count;
144  }
145
146}