001/** 
002 * Copyright (c) 2007-2008, Regents of the University of Colorado 
003 * All rights reserved.
004 * 
005 * Redistribution and use in source and binary forms, with or without
006 * modification, are permitted provided that the following conditions are met:
007 * 
008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
011 * 
012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
022 * POSSIBILITY OF SUCH DAMAGE. 
023 */
024package org.cleartk.corpus.ace2005;
025
026import java.io.BufferedReader;
027import java.io.File;
028import java.io.FileReader;
029import java.io.IOException;
030import java.util.ArrayList;
031import java.util.List;
032import java.util.regex.Matcher;
033import java.util.regex.Pattern;
034
035import org.apache.uima.UimaContext;
036import org.apache.uima.cas.CAS;
037import org.apache.uima.cas.CASException;
038import org.apache.uima.collection.CollectionException;
039import org.apache.uima.jcas.JCas;
040import org.apache.uima.resource.ResourceInitializationException;
041import org.apache.uima.util.FileUtils;
042import org.apache.uima.util.Progress;
043import org.apache.uima.util.ProgressImpl;
044import org.cleartk.ne.type.Ace2005Document;
045import org.cleartk.util.ViewUriUtil;
046import org.jdom2.Document;
047import org.jdom2.Element;
048import org.jdom2.JDOMException;
049import org.jdom2.input.SAXBuilder;
050import org.apache.uima.fit.component.JCasCollectionReader_ImplBase;
051import org.apache.uima.fit.descriptor.ConfigurationParameter;
052import org.apache.uima.fit.descriptor.SofaCapability;
053
054/**
055 * <br>
056 * Copyright (c) 2007-2008, Regents of the University of Colorado <br>
057 * All rights reserved.
058 * 
059 * 
060 * @author Philip Ogren
061 * 
062 */
063
064@SofaCapability(outputSofas = { Ace2005Constants.ACE_2005_APF_URI_VIEW, ViewUriUtil.URI })
065public class Ace2005GoldReader extends JCasCollectionReader_ImplBase {
066  public static final String PARAM_ACE_DIRECTORY_NAME = "aceDirectoryName";
067
068  @ConfigurationParameter(
069      name = PARAM_ACE_DIRECTORY_NAME,
070      mandatory = true,
071      description = "Takes the name of directory that contains ACE data.  Typically, a folder such as \".../ACE_2005/optimization/English/all\".  The folder should contain files that come in pairs - i.e. for each .sgm file there should be a corresponding .apf.xml file.")
072  private String aceDirectoryName;
073
074  private static final String PARAM_ACE_FILE_NAMES_DESCRIPTION = "takes a file that contains the names of the files to read.   \n"
075      + "The file should contain a list of the files in AceCorpusDir (one file name per line) \n"
076      + "that you want read in. File names should not include the last suffix(es) (e.g. \".sgm\" or \"apf.xml\") \n"
077      + "If parameter value is not given, then all files will be read in. An example file might look like this: \n\n"
078      + "AFP_ENG_20030304.0250\n" + "AFP_ENG_20030305.0918\n" + "...\n";
079
080  public static final String PARAM_ACE_FILE_NAMES_FILE = "aceFileNamesFile";
081
082  @ConfigurationParameter(
083      name = PARAM_ACE_FILE_NAMES_FILE,
084      description = PARAM_ACE_FILE_NAMES_DESCRIPTION)
085  private String aceFileNamesFile;
086
087  File[] aceFiles;
088
089  int aceFileIndex;
090
091  int aceFileCount;
092
093  File currentSGMFile = null;
094
095  public static final String TAG_REGEX = "<.*?>";
096
097  Pattern tagPattern;
098
099  public void initialize(UimaContext context) throws ResourceInitializationException {
100
101    if (!new File(aceDirectoryName).exists()) {
102      throw new ResourceInitializationException(new IOException(String.format(
103          "directory %s does not exist",
104          aceDirectoryName)));
105    }
106    File aceDirectory = new File(aceDirectoryName);
107
108    if (aceFileNamesFile != null && !aceFileNamesFile.trim().equals("")) {
109      try {
110        List<File> files = new ArrayList<File>();
111        BufferedReader reader = new BufferedReader(new FileReader(aceFileNamesFile));
112        String line;
113        try {
114          while ((line = reader.readLine()) != null) {
115            line = line.trim();
116            if (line.endsWith(".sgm"))
117              files.add(new File(aceDirectory, line));
118            else
119              files.add(new File(aceDirectory, line + ".sgm"));
120          }
121        } finally {
122          reader.close();
123        }
124        aceFiles = files.toArray(new File[files.size()]);
125      } catch (IOException ioe) {
126        throw new ResourceInitializationException(ioe);
127      }
128      for (File file : aceFiles) {
129        if (!file.exists())
130          throw new ResourceInitializationException(
131              ResourceInitializationException.COULD_NOT_ACCESS_DATA,
132              new Object[] { file });
133      }
134    } else {
135      aceFiles = aceDirectory.listFiles();
136    }
137    aceFileIndex = 0;
138    aceFileCount = 0;
139
140    tagPattern = Pattern.compile(TAG_REGEX, Pattern.MULTILINE | Pattern.DOTALL);
141  }
142
143  private File getNextSGMFile() {
144    if (currentSGMFile != null)
145      return currentSGMFile;
146    while (aceFileIndex < aceFiles.length) {
147      File sgmFile = aceFiles[aceFileIndex++];
148      if (sgmFile.getName().endsWith(".sgm")) {
149        currentSGMFile = sgmFile;
150        return sgmFile;
151      }
152    }
153    return null;
154  }
155
156  private File getAPFFile(File sgmFile) {
157    String apfFileName = sgmFile.getPath();
158    apfFileName = sgmFile.getPath().substring(0, apfFileName.length() - 3) + "apf.xml";
159    if (new File(apfFileName).exists())
160      return new File(apfFileName);
161
162    apfFileName = sgmFile.getPath();
163    apfFileName = sgmFile.getPath().substring(0, apfFileName.length() - 3) + "entities.apf.xml";
164    if (new File(apfFileName).exists())
165      return new File(apfFileName);
166
167    apfFileName = sgmFile.getPath();
168    apfFileName = sgmFile.getPath().substring(0, apfFileName.length() - 3) + "mentions.apf.xml";
169    if (new File(apfFileName).exists())
170      return new File(apfFileName);
171
172    return null;
173  }
174
175  private String getDocumentText(String sgmText) {
176    StringBuffer rawDocumentText = new StringBuffer(sgmText);
177    Matcher tagMatcher = tagPattern.matcher(rawDocumentText);
178    String documentText = tagMatcher.replaceAll("");
179    return documentText;
180  }
181
182  // make note about moving local dtd file into directory
183  public void getNext(JCas jCas) throws IOException, CollectionException {
184    try {
185      // we need the next sgm file which will typically be 'currentSGMFile' - but we
186      // will call getNextSGMFile() to be safe
187      File sgmFile = getNextSGMFile();
188      // setting currentSGMFile to null tells getNextSGMFile to get the next sgm file
189      // rather than simply returning the current value.
190      currentSGMFile = null;
191
192      String sgmText = FileUtils.file2String(sgmFile);
193
194      JCas initialView = jCas.getView(CAS.NAME_DEFAULT_SOFA);
195      initialView.setDocumentText(getDocumentText(sgmText));
196
197      // org.cleartk.type.Document sgmDocument = new org.cleartk.type.Document(initialView);
198      // sgmDocument.setIdentifier(sgmFile.getName());
199      // sgmDocument.setPath(sgmFile.getName());
200      // sgmDocument.addToIndexes();
201
202      File apfFile = getAPFFile(sgmFile);
203
204      SAXBuilder builder = new SAXBuilder();
205      builder.setDTDHandler(null);
206      Document doc = builder.build(apfFile);
207
208      Element apfSource = doc.getRootElement();
209      String uri = apfSource.getAttributeValue("URI");
210      String source = apfSource.getAttributeValue("SOURCE");
211      String type = apfSource.getAttributeValue("TYPE");
212
213      ViewUriUtil.setURI(jCas, sgmFile.toURI());
214      Ace2005Document document = new Ace2005Document(initialView);
215      document.setAceUri(uri);
216      document.setAceSource(source);
217      document.setAceType(type);
218      document.addToIndexes();
219
220      JCas apfUriView = jCas.createView(Ace2005Constants.ACE_2005_APF_URI_VIEW);
221      apfUriView.setSofaDataURI(apfFile.toURI().toString(), null);
222
223    } catch (CASException ce) {
224      throw new CollectionException(ce);
225    } catch (JDOMException je) {
226      throw new CollectionException(je);
227    }
228  }
229
230  public void close() throws IOException {
231    // TODO Auto-generated method stub
232
233  }
234
235  /**
236   * Progress is measured by the number of files in the target directory - not by the number of
237   * times getNext has been (and will be) called. This means that the total number of entities to
238   * completion is typically going to be 2 or 4 times as many 'documents' that are found depending
239   * on what kinds of files exist in the target directory (e.g. *.ag.xml, *.apf.xml, *.sgm, *.tab)
240   */
241  public Progress[] getProgress() {
242    return new Progress[] { new ProgressImpl(aceFileIndex, aceFiles.length, Progress.ENTITIES) };
243  }
244
245  public boolean hasNext() throws IOException, CollectionException {
246    return getNextSGMFile() != null;
247  }
248
249  public void setAceDirectoryName(String aceDirectoryName) {
250    this.aceDirectoryName = aceDirectoryName;
251  }
252
253  public void setAceFileNamesFile(String aceFileNamesFile) {
254    this.aceFileNamesFile = aceFileNamesFile;
255  }
256
257}