/**
 * Copyright (c) 2007-2008, Regents of the University of Colorado
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
package org.cleartk.corpus.ace2005;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.uima.UimaContext;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.FileUtils;
import org.apache.uima.util.Progress;
import org.apache.uima.util.ProgressImpl;
import org.cleartk.ne.type.Ace2005Document;
import org.cleartk.util.ViewUriUtil;
import org.jdom2.Document;
import org.jdom2.Element;
import org.jdom2.JDOMException;
import org.jdom2.input.SAXBuilder;
import org.apache.uima.fit.component.JCasCollectionReader_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.SofaCapability;

/**
 * <br>
 * Copyright (c) 2007-2008, Regents of the University of Colorado <br>
 * All rights reserved.
 *
 * <p>
 * Collection reader that iterates over the <code>.sgm</code> files of an ACE directory. For each
 * file it:
 * <ul>
 * <li>sets the text of the default view to the contents of the <code>.sgm</code> file with
 * everything matching {@link #TAG_REGEX} removed,</li>
 * <li>records the URI of the <code>.sgm</code> file via {@link ViewUriUtil},</li>
 * <li>adds an {@link Ace2005Document} annotation populated from the <code>URI</code>,
 * <code>SOURCE</code> and <code>TYPE</code> attributes of the root element of the companion
 * <code>.apf.xml</code> file, and</li>
 * <li>creates the {@link Ace2005Constants#ACE_2005_APF_URI_VIEW} view whose sofa data URI points
 * at that <code>.apf.xml</code> file.</li>
 * </ul>
 *
 * @author Philip Ogren
 *
 */

@SofaCapability(outputSofas = { Ace2005Constants.ACE_2005_APF_URI_VIEW, ViewUriUtil.URI })
public class Ace2005GoldReader extends JCasCollectionReader_ImplBase {
  public static final String PARAM_ACE_DIRECTORY_NAME = "aceDirectoryName";

  @ConfigurationParameter(
      name = PARAM_ACE_DIRECTORY_NAME,
      mandatory = true,
      description = "Takes the name of directory that contains ACE data. Typically, a folder such as \".../ACE_2005/optimization/English/all\". The folder should contain files that come in pairs - i.e. for each .sgm file there should be a corresponding .apf.xml file.")
  private String aceDirectoryName;

  private static final String PARAM_ACE_FILE_NAMES_DESCRIPTION = "takes a file that contains the names of the files to read. \n"
      + "The file should contain a list of the files in AceCorpusDir (one file name per line) \n"
      + "that you want read in. File names should not include the last suffix(es) (e.g. \".sgm\" or \"apf.xml\") \n"
      + "If parameter value is not given, then all files will be read in. An example file might look like this: \n\n"
      + "AFP_ENG_20030304.0250\n" + "AFP_ENG_20030305.0918\n" + "...\n";

  public static final String PARAM_ACE_FILE_NAMES_FILE = "aceFileNamesFile";

  @ConfigurationParameter(
      name = PARAM_ACE_FILE_NAMES_FILE,
      description = PARAM_ACE_FILE_NAMES_DESCRIPTION)
  private String aceFileNamesFile;

  /** Extension that identifies the source documents this reader iterates over. */
  private static final String SGM_SUFFIX = ".sgm";

  /**
   * Suffixes that replace the trailing "sgm" of a source file name to locate its annotation file,
   * in the order in which they are tried.
   */
  private static final String[] APF_SUFFIXES = { "apf.xml", "entities.apf.xml", "mentions.apf.xml" };

  /** Candidate files; entries that do not end in ".sgm" are skipped while iterating. */
  File[] aceFiles;

  /** Index into {@link #aceFiles} of the next candidate file to examine. */
  int aceFileIndex;

  /** Reset to zero by {@link #initialize(UimaContext)}; not otherwise used within this class. */
  int aceFileCount;

  /**
   * The .sgm file located by the most recent look-ahead that has not yet been consumed by
   * {@link #getNext(JCas)}, or null if there is none.
   */
  File currentSGMFile = null;

  public static final String TAG_REGEX = "<.*?>";

  /** Compiled form of {@link #TAG_REGEX}; created in {@link #initialize(UimaContext)}. */
  Pattern tagPattern;

  /**
   * Resolves the set of candidate files, either from the file-names file (when configured) or by
   * listing the ACE directory, and resets the iteration state.
   *
   * @param context
   *          the UIMA context (not used directly by this method)
   * @throws ResourceInitializationException
   *           if the ACE directory does not exist or cannot be listed, if the file-names file
   *           cannot be read, or if a file named in it does not exist
   */
  @Override
  public void initialize(UimaContext context) throws ResourceInitializationException {
    File aceDirectory = new File(aceDirectoryName);
    if (!aceDirectory.exists()) {
      throw new ResourceInitializationException(new IOException(String.format(
          "directory %s does not exist",
          aceDirectoryName)));
    }

    if (aceFileNamesFile != null && !aceFileNamesFile.trim().equals("")) {
      aceFiles = readListedFiles(aceDirectory);
      for (File file : aceFiles) {
        if (!file.exists()) {
          throw new ResourceInitializationException(
              ResourceInitializationException.COULD_NOT_ACCESS_DATA,
              new Object[] { file });
        }
      }
    } else {
      aceFiles = aceDirectory.listFiles();
      // File.listFiles() returns null (rather than throwing) when the path is not a directory or
      // an I/O error occurs; fail here instead of with a NullPointerException during iteration.
      if (aceFiles == null) {
        throw new ResourceInitializationException(new IOException(String.format(
            "%s is not a directory or its contents could not be listed",
            aceDirectoryName)));
      }
    }

    aceFileIndex = 0;
    aceFileCount = 0;
    // Drop any look-ahead left over from an earlier run so that it is consistent with the index.
    currentSGMFile = null;

    tagPattern = Pattern.compile(TAG_REGEX, Pattern.MULTILINE | Pattern.DOTALL);
  }

  /**
   * Reads the file-names file and maps every non-blank line to a .sgm file inside the given
   * directory. A line that already ends in ".sgm" is used as is; otherwise ".sgm" is appended.
   */
  private File[] readListedFiles(File aceDirectory) throws ResourceInitializationException {
    List<File> files = new ArrayList<File>();
    try {
      BufferedReader reader = new BufferedReader(new FileReader(aceFileNamesFile));
      try {
        String line;
        while ((line = reader.readLine()) != null) {
          line = line.trim();
          if (line.length() == 0) {
            // A blank (e.g. trailing) line would otherwise turn into the bogus entry ".sgm".
            continue;
          }
          String fileName = line.endsWith(SGM_SUFFIX) ? line : line + SGM_SUFFIX;
          files.add(new File(aceDirectory, fileName));
        }
      } finally {
        reader.close();
      }
    } catch (IOException ioe) {
      throw new ResourceInitializationException(ioe);
    }
    return files.toArray(new File[files.size()]);
  }

  /**
   * Returns the pending .sgm file, advancing through {@link #aceFiles} to find one if none is
   * pending. The result stays pending (and is returned again by later calls) until
   * {@link #currentSGMFile} is set back to null.
   *
   * @return the next .sgm file, or null when no candidates remain
   */
  private File getNextSGMFile() {
    if (currentSGMFile != null) {
      return currentSGMFile;
    }
    while (aceFileIndex < aceFiles.length) {
      File candidate = aceFiles[aceFileIndex++];
      if (candidate.getName().endsWith(SGM_SUFFIX)) {
        currentSGMFile = candidate;
        return candidate;
      }
    }
    return null;
  }

  /**
   * Locates the annotation file that accompanies a .sgm file by replacing the trailing "sgm" of
   * its path with each of {@link #APF_SUFFIXES} in turn.
   *
   * @param sgmFile
   *          a file whose path ends in "sgm"
   * @return the first candidate that exists, or null if none does
   */
  private File getAPFFile(File sgmFile) {
    String sgmPath = sgmFile.getPath();
    // Strip the three characters "sgm" but keep the preceding dot.
    String basePath = sgmPath.substring(0, sgmPath.length() - 3);
    for (String suffix : APF_SUFFIXES) {
      File apfFile = new File(basePath + suffix);
      if (apfFile.exists()) {
        return apfFile;
      }
    }
    return null;
  }

  /**
   * Removes every match of {@link #tagPattern} from the raw contents of a .sgm file.
   *
   * @param sgmText
   *          the complete contents of a .sgm file
   * @return the text with all matches deleted
   */
  private String getDocumentText(String sgmText) {
    Matcher tagMatcher = tagPattern.matcher(sgmText);
    return tagMatcher.replaceAll("");
  }

  /**
   * Populates the CAS from the next .sgm file and its companion .apf.xml file.
   *
   * <p>
   * NOTE(review): the original author left a reminder here to "make note about moving local dtd
   * file into directory" - presumably the DTD referenced by the .apf.xml files has to be
   * resolvable when they are parsed; confirm and document.
   *
   * @param jCas
   *          the CAS to populate
   * @throws IOException
   *           if a file cannot be read or no .apf.xml file exists for the .sgm file
   * @throws CollectionException
   *           if no .sgm file is left, or a CAS or XML parsing error occurs
   */
  @Override
  public void getNext(JCas jCas) throws IOException, CollectionException {
    // The next .sgm file is typically already pending from hasNext(), but look it up again so
    // that getNext() also works when hasNext() was not called first.
    File sgmFile = getNextSGMFile();
    // Clearing the pending file makes the following look-up advance to a new file.
    currentSGMFile = null;
    if (sgmFile == null) {
      throw new CollectionException(new IllegalStateException(
          "getNext called but there are no more .sgm files to read"));
    }

    // Resolve the annotation file before touching the CAS so that a missing file is reported
    // clearly instead of surfacing as a NullPointerException further down.
    File apfFile = getAPFFile(sgmFile);
    if (apfFile == null) {
      throw new IOException(String.format("no .apf.xml file found for %s", sgmFile.getPath()));
    }

    try {
      String sgmText = FileUtils.file2String(sgmFile);

      JCas initialView = jCas.getView(CAS.NAME_DEFAULT_SOFA);
      initialView.setDocumentText(getDocumentText(sgmText));

      SAXBuilder builder = new SAXBuilder();
      builder.setDTDHandler(null);
      Document doc = builder.build(apfFile);

      Element apfSource = doc.getRootElement();
      String uri = apfSource.getAttributeValue("URI");
      String source = apfSource.getAttributeValue("SOURCE");
      String type = apfSource.getAttributeValue("TYPE");

      ViewUriUtil.setURI(jCas, sgmFile.toURI());
      Ace2005Document document = new Ace2005Document(initialView);
      document.setAceUri(uri);
      document.setAceSource(source);
      document.setAceType(type);
      document.addToIndexes();

      JCas apfUriView = jCas.createView(Ace2005Constants.ACE_2005_APF_URI_VIEW);
      apfUriView.setSofaDataURI(apfFile.toURI().toString(), null);

    } catch (CASException ce) {
      throw new CollectionException(ce);
    } catch (JDOMException je) {
      throw new CollectionException(je);
    }
  }

  /**
   * Does nothing; this reader keeps no resources open between calls.
   */
  @Override
  public void close() throws IOException {
    // nothing to release
  }

  /**
   * Progress is measured by the number of files in the candidate list - not by the number of
   * times getNext has been (and will be) called. When the candidates come from a directory
   * listing, the total is therefore typically 2 or 4 times the number of 'documents' actually
   * read, depending on what kinds of files exist in the target directory (e.g. *.ag.xml,
   * *.apf.xml, *.sgm, *.tab).
   */
  @Override
  public Progress[] getProgress() {
    return new Progress[] { new ProgressImpl(aceFileIndex, aceFiles.length, Progress.ENTITIES) };
  }

  /**
   * Reports whether another .sgm file is available. The file found by this look-ahead is kept
   * pending for the next call to {@link #getNext(JCas)}.
   */
  @Override
  public boolean hasNext() throws IOException, CollectionException {
    return getNextSGMFile() != null;
  }

  public void setAceDirectoryName(String aceDirectoryName) {
    this.aceDirectoryName = aceDirectoryName;
  }

  public void setAceFileNamesFile(String aceFileNamesFile) {
    this.aceFileNamesFile = aceFileNamesFile;
  }

}