001/*
002 * Copyright (c) 2011, Regents of the University of Colorado 
003 * All rights reserved.
004 * 
005 * Redistribution and use in source and binary forms, with or without
006 * modification, are permitted provided that the following conditions are met:
007 * 
008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
011 * 
012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
022 * POSSIBILITY OF SUCH DAMAGE. 
023 */
024package org.cleartk.corpus.timeml;
025
026import java.io.File;
027import java.io.FileNotFoundException;
028import java.io.IOException;
029import java.net.URI;
030import java.net.URISyntaxException;
031import java.util.ArrayList;
032import java.util.Arrays;
033import java.util.HashMap;
034import java.util.HashSet;
035import java.util.List;
036import java.util.Map;
037import java.util.Set;
038
039import org.apache.uima.UimaContext;
040import org.apache.uima.cas.CASException;
041import org.apache.uima.collection.CollectionException;
042import org.apache.uima.collection.CollectionReader;
043import org.apache.uima.jcas.JCas;
044import org.apache.uima.resource.ResourceInitializationException;
045import org.apache.uima.util.Progress;
046import org.apache.uima.util.ProgressImpl;
047import org.cleartk.util.ViewUriUtil;
048import org.apache.uima.fit.component.JCasCollectionReader_ImplBase;
049import org.apache.uima.fit.descriptor.ConfigurationParameter;
050import org.apache.uima.fit.factory.CollectionReaderFactory;
051
052import com.google.common.base.Charsets;
053import com.google.common.collect.ArrayListMultimap;
054import com.google.common.collect.ListMultimap;
055import com.google.common.io.Files;
056
057/**
058 * <br>
059 * Copyright (c) 2011, Regents of the University of Colorado <br>
060 * All rights reserved.
061 * 
062 * @author Steven Bethard
063 */
064public class TempEval2010CollectionReader extends JCasCollectionReader_ImplBase {
065
066  public static final String BASE_SEGMENTATION_VIEW_NAME = "base-segmentation.tab";
067
068  public static final String DCT_VIEW_NAME = "dct.txt";
069
070  public static final String EVENT_EXTENTS_VIEW_NAME = "event-extents.tab";
071
072  public static final String EVENT_ATTRIBUTES_VIEW_NAME = "event-attributes.tab";
073
074  public static final String TIMEX_EXTENTS_VIEW_NAME = "timex-extents.tab";
075
076  public static final String TIMEX_ATTRIBUTES_VIEW_NAME = "timex-attributes.tab";
077
078  public static final String TLINK_DCT_EVENT_VIEW_NAME = "tlinks-dct-event.tab";
079
080  public static final String TLINK_MAIN_EVENTS_VIEW_NAME = "tlinks-main-events.tab";
081
082  public static final String TLINK_SUBORDINATED_EVENTS_VIEW_NAME = "tlinks-subordinated-events.tab";
083
084  public static final String TLINK_TIMEX_EVENT_VIEW_NAME = "tlinks-timex-event.tab";
085
086  public static CollectionReader getCollectionReader(String... dataPaths)
087      throws ResourceInitializationException {
088    List<File> dirs = new ArrayList<File>();
089    for (String path : dataPaths) {
090      dirs.add(new File(path));
091    }
092    return getCollectionReader(dirs);
093  }
094
095  public static CollectionReader getCollectionReader(List<File> dataDirectories)
096      throws ResourceInitializationException {
097    return getCollectionReader(dataDirectories, null);
098  }
099
100  public static CollectionReader getCollectionReader(
101      List<File> dataDirectories,
102      Set<String> selectedFileNames) throws ResourceInitializationException {
103    // workaround UimaFIT limitation
104    List<String> dirsList = new ArrayList<String>();
105    for (File dir : dataDirectories) {
106      dirsList.add(dir.getPath());
107    }
108    String[] dirs = dirsList.toArray(new String[dirsList.size()]);
109    String[] names = selectedFileNames == null
110        ? null
111        : selectedFileNames.toArray(new String[selectedFileNames.size()]);
112    return CollectionReaderFactory.createReader(
113        TempEval2010CollectionReader.class,
114        null,
115        PARAM_DATA_DIRECTORIES,
116        dirs,
117        PARAM_SELECTED_FILE_NAMES,
118        names);
119  }
120
121  @ConfigurationParameter(
122      name = PARAM_DATA_DIRECTORIES,
123      mandatory = true,
124      description = "The directories containing the TempEval "
125          + "2010 data, e.g. \"tempeval-training-2/english\" and \"tempeval2-test/english\"")
126  protected List<File> dataDirectories;
127
128  public static final String PARAM_DATA_DIRECTORIES = "dataDirectories";
129
130  @ConfigurationParameter(
131      name = PARAM_SELECTED_FILE_NAMES,
132      mandatory = false,
133      description = "The names of files that should be included when reading, "
134          + "e.g \"ABC19980108.1830.0711\". If null, then all files in the dataset will be included.")
135  protected Set<String> selectedFileNames;
136
137  public static final String PARAM_SELECTED_FILE_NAMES = "selectedFileNames";
138
139  protected List<URI> uris;
140
141  protected int uriIndex;
142
143  private Map<String, Map<String, String>> viewFileTexts;
144
145  @Override
146  public void initialize(UimaContext context) throws ResourceInitializationException {
147    super.initialize(context);
148
149    try {
150      // assemble URIs for each file in the data
151      this.uriIndex = 0;
152      this.uris = new ArrayList<URI>();
153      for (File dataDirectory : this.dataDirectories) {
154        URI dataURI = dataDirectory.toURI();
155        for (String fileName : getAnnotatedFileNames(dataDirectory)) {
156          if (this.selectedFileNames == null || this.selectedFileNames.contains(fileName)) {
157            URI uri = new URI(dataURI.getScheme(), dataURI.getHost(), dataURI.getPath(), fileName);
158            this.uris.add(uri);
159          }
160        }
161      }
162
163      // group lines by filename
164      this.viewFileTexts = new HashMap<String, Map<String, String>>();
165      for (String viewName : Arrays.asList(
166          BASE_SEGMENTATION_VIEW_NAME,
167          DCT_VIEW_NAME,
168          EVENT_EXTENTS_VIEW_NAME,
169          EVENT_ATTRIBUTES_VIEW_NAME,
170          TIMEX_EXTENTS_VIEW_NAME,
171          TIMEX_ATTRIBUTES_VIEW_NAME,
172          TLINK_DCT_EVENT_VIEW_NAME,
173          TLINK_MAIN_EVENTS_VIEW_NAME,
174          TLINK_SUBORDINATED_EVENTS_VIEW_NAME,
175          TLINK_TIMEX_EVENT_VIEW_NAME)) {
176        // assumes view names are the same as the .tab file names
177        this.viewFileTexts.put(viewName, this.textByFileName(viewName));
178      }
179    } catch (IOException e) {
180      throw new ResourceInitializationException(e);
181    } catch (URISyntaxException e) {
182      throw new ResourceInitializationException(e);
183    }
184  }
185
186  @Override
187  public boolean hasNext() throws IOException, CollectionException {
188    return this.uriIndex < this.uris.size();
189  }
190
191  @Override
192  public void getNext(JCas jCas) throws IOException, CollectionException {
193    URI uri = this.uris.get(this.uriIndex);
194    this.uriIndex += 1;
195    ViewUriUtil.setURI(jCas, uri);
196
197    String fileName = uri.getFragment();
198    for (String viewName : this.viewFileTexts.keySet()) {
199      JCas view;
200      try {
201        view = jCas.createView(viewName);
202      } catch (CASException e) {
203        throw new CollectionException(e);
204      }
205      String text = this.viewFileTexts.get(viewName).get(fileName);
206      view.setDocumentText(text == null ? "" : text);
207    }
208  }
209
210  @Override
211  public Progress[] getProgress() {
212    return new Progress[] { new ProgressImpl(this.uriIndex, this.uris.size(), Progress.ENTITIES) };
213  }
214
215  private Map<String, String> textByFileName(String tabFileName) throws IOException {
216    // get all variants of the file under all subdirectories
217    List<File> files = new ArrayList<File>();
218    for (File dir : this.dataDirectories) {
219      files.addAll(getTempEvalFiles(dir, tabFileName));
220    }
221
222    // map each file name to its lines
223    ListMultimap<String, String> fileLines = ArrayListMultimap.create();
224    for (File file : files) {
225      for (String line : Files.readLines(file, Charsets.US_ASCII)) {
226        String fileName = getAnnotatedFileName(line);
227        fileLines.put(fileName, line);
228      }
229    }
230
231    // convert lists of lines back into text
232    Map<String, String> fileTexts = new HashMap<String, String>();
233    for (String fileName : fileLines.keySet()) {
234      StringBuilder builder = new StringBuilder();
235      for (String line : fileLines.get(fileName)) {
236        builder.append(line).append('\n');
237      }
238      fileTexts.put(fileName, builder.toString());
239    }
240    return fileTexts;
241  }
242
243  private static List<File> getTempEvalFiles(File dataDirectory, String tabFileName)
244      throws FileNotFoundException {
245
246    // subdirectory is "data" for training, and both "entities" and "relations" for testing
247    List<File> files = new ArrayList<File>();
248    for (String subDir : Arrays.asList("data", "key")) {
249      files.add(new File(new File(dataDirectory, subDir), tabFileName));
250    }
251    // weird special case: dct.txt is dct-en.txt in testing base directory
252    files.add(new File(dataDirectory, tabFileName.replaceAll("\\.txt", "-en.txt")));
253
254    // filter existing files
255    List<File> existingFiles = new ArrayList<File>();
256    for (File file : files) {
257      if (file.exists()) {
258        existingFiles.add(file);
259      }
260    }
261
262    // error if we didn't find at least one
263    if (existingFiles.size() == 0) {
264      throw new FileNotFoundException("Could not find any of " + files);
265    }
266    return existingFiles;
267  }
268
269  protected static String getAnnotatedFileName(String line) {
270    // the filename is the first column
271    String[] parts = line.split("\t", 2);
272    if (parts.length != 2) {
273      throw new IllegalArgumentException("Expected <filename>\t..., found " + line);
274    }
275    return parts[0];
276  }
277
278  public static List<String> getAnnotatedFileNames(File dataDirectory) throws IOException {
279    // look for file names in all the base segmentation files
280    List<String> fileNames = new ArrayList<String>();
281    Set<String> seenFileNames = new HashSet<String>();
282    for (File tabFile : getTempEvalFiles(dataDirectory, "base-segmentation.tab")) {
283      for (String line : Files.readLines(tabFile, Charsets.US_ASCII)) {
284
285        // add the filename to the list if we haven't already seen it
286        String fileName = getAnnotatedFileName(line);
287        if (!seenFileNames.contains(fileName)) {
288          seenFileNames.add(fileName);
289          fileNames.add(fileName);
290        }
291      }
292    }
293    return fileNames;
294  }
295}