001/* 002 * Copyright (c) 2011, Regents of the University of Colorado 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without 006 * modification, are permitted provided that the following conditions are met: 007 * 008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 011 * 012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 022 * POSSIBILITY OF SUCH DAMAGE. 023 */ 024package org.cleartk.corpus.timeml; 025 026import java.io.File; 027import java.io.FileNotFoundException; 028import java.io.IOException; 029import java.net.URI; 030import java.net.URISyntaxException; 031import java.util.ArrayList; 032import java.util.Arrays; 033import java.util.HashMap; 034import java.util.HashSet; 035import java.util.List; 036import java.util.Map; 037import java.util.Set; 038 039import org.apache.uima.UimaContext; 040import org.apache.uima.cas.CASException; 041import org.apache.uima.collection.CollectionException; 042import org.apache.uima.collection.CollectionReader; 043import org.apache.uima.jcas.JCas; 044import org.apache.uima.resource.ResourceInitializationException; 045import org.apache.uima.util.Progress; 046import org.apache.uima.util.ProgressImpl; 047import org.cleartk.util.ViewUriUtil; 048import org.apache.uima.fit.component.JCasCollectionReader_ImplBase; 049import org.apache.uima.fit.descriptor.ConfigurationParameter; 050import org.apache.uima.fit.factory.CollectionReaderFactory; 051 052import com.google.common.base.Charsets; 053import com.google.common.collect.ArrayListMultimap; 054import com.google.common.collect.ListMultimap; 055import com.google.common.io.Files; 056 057/** 058 * <br> 059 * Copyright (c) 2011, Regents of the University of Colorado <br> 060 * All rights reserved. 061 * 062 * @author Steven Bethard 063 */ 064public class TempEval2010CollectionReader extends JCasCollectionReader_ImplBase { 065 066 public static final String BASE_SEGMENTATION_VIEW_NAME = "base-segmentation.tab"; 067 068 public static final String DCT_VIEW_NAME = "dct.txt"; 069 070 public static final String EVENT_EXTENTS_VIEW_NAME = "event-extents.tab"; 071 072 public static final String EVENT_ATTRIBUTES_VIEW_NAME = "event-attributes.tab"; 073 074 public static final String TIMEX_EXTENTS_VIEW_NAME = "timex-extents.tab"; 075 076 public static final String TIMEX_ATTRIBUTES_VIEW_NAME = "timex-attributes.tab"; 077 078 public static final String TLINK_DCT_EVENT_VIEW_NAME = "tlinks-dct-event.tab"; 079 080 public static final String TLINK_MAIN_EVENTS_VIEW_NAME = "tlinks-main-events.tab"; 081 082 public static final String TLINK_SUBORDINATED_EVENTS_VIEW_NAME = "tlinks-subordinated-events.tab"; 083 084 public static final String TLINK_TIMEX_EVENT_VIEW_NAME = "tlinks-timex-event.tab"; 085 086 public static CollectionReader getCollectionReader(String... dataPaths) 087 throws ResourceInitializationException { 088 List<File> dirs = new ArrayList<File>(); 089 for (String path : dataPaths) { 090 dirs.add(new File(path)); 091 } 092 return getCollectionReader(dirs); 093 } 094 095 public static CollectionReader getCollectionReader(List<File> dataDirectories) 096 throws ResourceInitializationException { 097 return getCollectionReader(dataDirectories, null); 098 } 099 100 public static CollectionReader getCollectionReader( 101 List<File> dataDirectories, 102 Set<String> selectedFileNames) throws ResourceInitializationException { 103 // workaround UimaFIT limitation 104 List<String> dirsList = new ArrayList<String>(); 105 for (File dir : dataDirectories) { 106 dirsList.add(dir.getPath()); 107 } 108 String[] dirs = dirsList.toArray(new String[dirsList.size()]); 109 String[] names = selectedFileNames == null 110 ? null 111 : selectedFileNames.toArray(new String[selectedFileNames.size()]); 112 return CollectionReaderFactory.createReader( 113 TempEval2010CollectionReader.class, 114 null, 115 PARAM_DATA_DIRECTORIES, 116 dirs, 117 PARAM_SELECTED_FILE_NAMES, 118 names); 119 } 120 121 @ConfigurationParameter( 122 name = PARAM_DATA_DIRECTORIES, 123 mandatory = true, 124 description = "The directories containing the TempEval " 125 + "2010 data, e.g. \"tempeval-training-2/english\" and \"tempeval2-test/english\"") 126 protected List<File> dataDirectories; 127 128 public static final String PARAM_DATA_DIRECTORIES = "dataDirectories"; 129 130 @ConfigurationParameter( 131 name = PARAM_SELECTED_FILE_NAMES, 132 mandatory = false, 133 description = "The names of files that should be included when reading, " 134 + "e.g \"ABC19980108.1830.0711\". If null, then all files in the dataset will be included.") 135 protected Set<String> selectedFileNames; 136 137 public static final String PARAM_SELECTED_FILE_NAMES = "selectedFileNames"; 138 139 protected List<URI> uris; 140 141 protected int uriIndex; 142 143 private Map<String, Map<String, String>> viewFileTexts; 144 145 @Override 146 public void initialize(UimaContext context) throws ResourceInitializationException { 147 super.initialize(context); 148 149 try { 150 // assemble URIs for each file in the data 151 this.uriIndex = 0; 152 this.uris = new ArrayList<URI>(); 153 for (File dataDirectory : this.dataDirectories) { 154 URI dataURI = dataDirectory.toURI(); 155 for (String fileName : getAnnotatedFileNames(dataDirectory)) { 156 if (this.selectedFileNames == null || this.selectedFileNames.contains(fileName)) { 157 URI uri = new URI(dataURI.getScheme(), dataURI.getHost(), dataURI.getPath(), fileName); 158 this.uris.add(uri); 159 } 160 } 161 } 162 163 // group lines by filename 164 this.viewFileTexts = new HashMap<String, Map<String, String>>(); 165 for (String viewName : Arrays.asList( 166 BASE_SEGMENTATION_VIEW_NAME, 167 DCT_VIEW_NAME, 168 EVENT_EXTENTS_VIEW_NAME, 169 EVENT_ATTRIBUTES_VIEW_NAME, 170 TIMEX_EXTENTS_VIEW_NAME, 171 TIMEX_ATTRIBUTES_VIEW_NAME, 172 TLINK_DCT_EVENT_VIEW_NAME, 173 TLINK_MAIN_EVENTS_VIEW_NAME, 174 TLINK_SUBORDINATED_EVENTS_VIEW_NAME, 175 TLINK_TIMEX_EVENT_VIEW_NAME)) { 176 // assumes view names are the same as the .tab file names 177 this.viewFileTexts.put(viewName, this.textByFileName(viewName)); 178 } 179 } catch (IOException e) { 180 throw new ResourceInitializationException(e); 181 } catch (URISyntaxException e) { 182 throw new ResourceInitializationException(e); 183 } 184 } 185 186 @Override 187 public boolean hasNext() throws IOException, CollectionException { 188 return this.uriIndex < this.uris.size(); 189 } 190 191 @Override 192 public void getNext(JCas jCas) throws IOException, CollectionException { 193 URI uri = this.uris.get(this.uriIndex); 194 this.uriIndex += 1; 195 ViewUriUtil.setURI(jCas, uri); 196 197 String fileName = uri.getFragment(); 198 for (String viewName : this.viewFileTexts.keySet()) { 199 JCas view; 200 try { 201 view = jCas.createView(viewName); 202 } catch (CASException e) { 203 throw new CollectionException(e); 204 } 205 String text = this.viewFileTexts.get(viewName).get(fileName); 206 view.setDocumentText(text == null ? "" : text); 207 } 208 } 209 210 @Override 211 public Progress[] getProgress() { 212 return new Progress[] { new ProgressImpl(this.uriIndex, this.uris.size(), Progress.ENTITIES) }; 213 } 214 215 private Map<String, String> textByFileName(String tabFileName) throws IOException { 216 // get all variants of the file under all subdirectories 217 List<File> files = new ArrayList<File>(); 218 for (File dir : this.dataDirectories) { 219 files.addAll(getTempEvalFiles(dir, tabFileName)); 220 } 221 222 // map each file name to its lines 223 ListMultimap<String, String> fileLines = ArrayListMultimap.create(); 224 for (File file : files) { 225 for (String line : Files.readLines(file, Charsets.US_ASCII)) { 226 String fileName = getAnnotatedFileName(line); 227 fileLines.put(fileName, line); 228 } 229 } 230 231 // convert lists of lines back into text 232 Map<String, String> fileTexts = new HashMap<String, String>(); 233 for (String fileName : fileLines.keySet()) { 234 StringBuilder builder = new StringBuilder(); 235 for (String line : fileLines.get(fileName)) { 236 builder.append(line).append('\n'); 237 } 238 fileTexts.put(fileName, builder.toString()); 239 } 240 return fileTexts; 241 } 242 243 private static List<File> getTempEvalFiles(File dataDirectory, String tabFileName) 244 throws FileNotFoundException { 245 246 // subdirectory is "data" for training, and both "entities" and "relations" for testing 247 List<File> files = new ArrayList<File>(); 248 for (String subDir : Arrays.asList("data", "key")) { 249 files.add(new File(new File(dataDirectory, subDir), tabFileName)); 250 } 251 // weird special case: dct.txt is dct-en.txt in testing base directory 252 files.add(new File(dataDirectory, tabFileName.replaceAll("\\.txt", "-en.txt"))); 253 254 // filter existing files 255 List<File> existingFiles = new ArrayList<File>(); 256 for (File file : files) { 257 if (file.exists()) { 258 existingFiles.add(file); 259 } 260 } 261 262 // error if we didn't find at least one 263 if (existingFiles.size() == 0) { 264 throw new FileNotFoundException("Could not find any of " + files); 265 } 266 return existingFiles; 267 } 268 269 protected static String getAnnotatedFileName(String line) { 270 // the filename is the first column 271 String[] parts = line.split("\t", 2); 272 if (parts.length != 2) { 273 throw new IllegalArgumentException("Expected <filename>\t..., found " + line); 274 } 275 return parts[0]; 276 } 277 278 public static List<String> getAnnotatedFileNames(File dataDirectory) throws IOException { 279 // look for file names in all the base segmentation files 280 List<String> fileNames = new ArrayList<String>(); 281 Set<String> seenFileNames = new HashSet<String>(); 282 for (File tabFile : getTempEvalFiles(dataDirectory, "base-segmentation.tab")) { 283 for (String line : Files.readLines(tabFile, Charsets.US_ASCII)) { 284 285 // add the filename to the list if we haven't already seen it 286 String fileName = getAnnotatedFileName(line); 287 if (!seenFileNames.contains(fileName)) { 288 seenFileNames.add(fileName); 289 fileNames.add(fileName); 290 } 291 } 292 } 293 return fileNames; 294 } 295}