/**
 * Copyright (c) 2010, Regents of the University of Colorado
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
package org.cleartk.util.collection;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.Writer;
import java.util.HashMap;
import java.util.Map;
import java.util.logging.Logger;

/**
 * Maps strings to integers in two phases.
 *
 * <p>
 * While the map is expandable (the initial state), {@link #getOrGenerateInteger(String)} hands out
 * consecutive integers starting at 1 in order of first appearance and counts how often each string
 * is requested. {@link #finalizeMap()} then freezes the map, keeping only the strings that were
 * requested at least {@code cutoff} times. Kept strings retain the integer they were originally
 * given, so the surviving integers are not necessarily contiguous. After finalization only
 * {@link #getInteger(String)} and the {@code write} methods may be used.
 *
 * <p>
 * Calling a method in the wrong phase throws {@link UnsupportedOperationException}.
 *
 * <br>
 * Copyright (c) 2010, Regents of the University of Colorado <br>
 * All rights reserved.
 */
public class GenericStringMapper implements StringMapper, Writable {

  private static final long serialVersionUID = -9129249759791539649L;

  /**
   * Creates an empty, expandable mapper.
   *
   * @param cutoff
   *          minimum number of times a string must be passed to
   *          {@link #getOrGenerateInteger(String)} to survive {@link #finalizeMap()}
   */
  public GenericStringMapper(int cutoff) {
    this.cutoff = cutoff;
  }

  /**
   * Returns the integer assigned to {@code s}, assigning the next free integer if {@code s} has
   * not been seen before. Every call counts as one occurrence of {@code s}.
   *
   * @param s
   *          the string to look up or register
   * @return the integer assigned to {@code s}
   * @throws UnsupportedOperationException
   *           if the map has already been finalized
   */
  public int getOrGenerateInteger(String s) {
    if (!expandMap) {
      throw new UnsupportedOperationException(
          "map has been finalized; no new integers can be generated");
    }

    // Values stored in countingMap are never null, so a null result means "not seen yet".
    Entry entry = countingMap.get(s);
    if (entry == null) {
      entry = new Entry(nextValue++);
      countingMap.put(s, entry);
    } else {
      entry.increment();
    }
    return entry.i;
  }

  /**
   * Returns the integer assigned to {@code s} in the finalized map.
   *
   * @param s
   *          the string to look up
   * @return the integer assigned to {@code s}
   * @throws UnknownKeyException
   *           if {@code s} is not in the finalized map (never seen, or discarded by the cutoff)
   * @throws UnsupportedOperationException
   *           if the map has not been finalized yet
   */
  public int getInteger(String s) throws UnknownKeyException {
    if (expandMap) {
      throw new UnsupportedOperationException(
          "map has not been finalized; call finalizeMap() first");
    }

    // Values stored in stringIntMap are never null, so a null result means "unknown key".
    Integer value = stringIntMap.get(s);
    if (value == null) {
      throw new UnknownKeyException(s);
    }
    return value;
  }

  /**
   * Freezes the map: keeps every string whose occurrence count is at least the cutoff, drops the
   * rest, releases the counting data and switches the mapper to read-only mode. If anything was
   * dropped, an info-level message with the numbers is logged.
   *
   * @throws UnsupportedOperationException
   *           if the counting data has already been released by an earlier call
   */
  public void finalizeMap() {
    // A second call used to fail with a bare NullPointerException on the released counting map.
    if (countingMap == null) {
      throw new UnsupportedOperationException("finalizeMap() has already been called");
    }

    int total = countingMap.size();
    Map<String, Integer> keptMap = new HashMap<String, Integer>();
    for (Map.Entry<String, Entry> counted : countingMap.entrySet()) {
      Entry entry = counted.getValue();
      if (entry.count >= cutoff) {
        keptMap.put(counted.getKey(), entry.i);
      }
    }
    stringIntMap = keptMap;

    int kept = keptMap.size();
    if (total - kept > 0) {
      Logger.getLogger(this.getClass().getName()).info(
          String.format(
              "discarded %d features that occurred less than %d times; %d features remaining",
              total - kept,
              cutoff,
              kept));
    }

    countingMap = null;
    expandMap = false;
  }

  /**
   * Writes the finalized map to {@code file}, one {@code "<integer> <string>"} pair per line.
   *
   * @param file
   *          the file to create or overwrite
   * @throws IOException
   *           if the file cannot be opened or written
   * @throws UnsupportedOperationException
   *           if the map has not been finalized yet
   */
  public void write(File file) throws IOException {
    // NOTE(review): FileWriter encodes with the platform default charset; whatever reads this file
    // back must use the same charset - confirm against the reader before changing it.
    Writer writer = new FileWriter(file);
    try {
      write(writer);
    } finally {
      // Close even when writing fails so the file handle is not leaked.
      writer.close();
    }
  }

  /**
   * Writes the finalized map to {@code writer}, one {@code "<integer> <string>"} pair per line,
   * then flushes the writer. The writer is not closed.
   *
   * @param writer
   *          destination for the serialized map
   * @throws IOException
   *           if writing or flushing fails
   * @throws UnsupportedOperationException
   *           if the map has not been finalized yet
   */
  public void write(Writer writer) throws IOException {
    if (expandMap) {
      throw new UnsupportedOperationException(
          "map has not been finalized; call finalizeMap() first");
    }

    for (Map.Entry<String, Integer> mapping : stringIntMap.entrySet()) {
      // Plain concatenation instead of String.format("%d ..."): %d localizes digits according to
      // the default locale, which would make the written file depend on the machine's locale.
      writer.append(mapping.getValue() + " " + mapping.getKey() + "\n");
    }

    writer.flush();
  }

  /** True until {@link #finalizeMap()} has run; while true, new strings may be registered. */
  boolean expandMap = true;

  /** The integer that will be given to the next previously unseen string. */
  int nextValue = 1;

  /** Minimum occurrence count a string needs to be kept by {@link #finalizeMap()}. */
  int cutoff;

  /** Assigned integer and occurrence count per string; set to null by {@link #finalizeMap()}. */
  Map<String, Entry> countingMap = new HashMap<String, Entry>();

  /** The finalized string-to-integer mapping; null until {@link #finalizeMap()} has run. */
  Map<String, Integer> stringIntMap = null;

  /** The integer assigned to one string together with the number of times it was requested. */
  private static class Entry {
    public Entry(int i) {
      this.i = i;
      this.count = 1;
    }

    public void increment() {
      count += 1;
    }

    /** The integer assigned to the string. */
    public int i;

    /** Number of times the string has been requested, including the first request. */
    public int count;
  }

}