/** 
 * Copyright (c) 2007-2008, Regents of the University of Colorado 
 * All rights reserved.
 * 
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 * 
 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
 * 
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE. 
 */
package org.cleartk.corpus.conll2005;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.zip.GZIPInputStream;

import org.apache.uima.UimaContext;
import org.apache.uima.cas.CASException;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.collection.CollectionReader;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Progress;
import org.apache.uima.util.ProgressImpl;
import org.cleartk.util.ViewUriUtil;
import org.apache.uima.fit.component.JCasCollectionReader_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.SofaCapability;
import org.apache.uima.fit.factory.CollectionReaderFactory;

/**
 * <br>
 * Copyright (c) 2007-2008, Regents of the University of Colorado <br>
 * All rights reserved.
 * <p>
 * Collection reader that splits a single CoNLL 2005 data file into documents. A document is a
 * maximal run of non-blank lines; blank lines separate documents. Each document's text (every
 * line trimmed and terminated with a newline) is stored in the
 * {@link Conll2005Constants#CONLL_2005_VIEW} view, and the CAS URI is set to the data file's URI
 * with the 1-based document number as its fragment.
 * <p>
 * Files whose name ends in {@code .gz} are read through a {@link GZIPInputStream}.
 */
@SofaCapability(outputSofas = { Conll2005Constants.CONLL_2005_VIEW, ViewUriUtil.URI })
public class Conll2005GoldReader extends JCasCollectionReader_ImplBase {

  public static final String PARAM_CONLL2005_DATA_FILE = "conll2005DataFile";

  /**
   * Creates a reader over the given CoNLL 2005 data file.
   * 
   * @param conll2005DataFile
   *          the path of the CoNLL 2005 data file
   * @return a collection reader configured with {@link #PARAM_CONLL2005_DATA_FILE}
   * @throws ResourceInitializationException
   *           if the reader cannot be created
   */
  public static CollectionReader getCollectionReader(String conll2005DataFile)
      throws ResourceInitializationException {
    return CollectionReaderFactory.createReader(
        Conll2005GoldReader.class,
        PARAM_CONLL2005_DATA_FILE,
        conll2005DataFile);
  }

  @ConfigurationParameter(name = PARAM_CONLL2005_DATA_FILE, mandatory = true, description = "the path of the CoNLL 2005 data file")
  private File conll2005DataFile;

  /** Reader positioned at the next unread line of the data file. */
  private BufferedReader reader;

  /** Number of documents handed out so far; also the fragment of the most recent URI. */
  private int documentNumber;

  /** Number of documents in the data file, counted once during initialization. */
  private int totalDocuments;

  /**
   * Counts the documents in the data file with a first pass, then reopens the file so that
   * {@link #getNext(JCas)} starts reading from the beginning.
   * 
   * @throws ResourceInitializationException
   *           if the data file cannot be opened or read
   */
  @Override
  public void initialize(UimaContext context) throws ResourceInitializationException {
    try {
      BufferedReader countingReader = this.getBufferedReader();
      try {
        this.totalDocuments = countDocuments(countingReader);
      } finally {
        // release the counting pass even when reading fails part-way through
        countingReader.close();
      }

      this.reader = this.getBufferedReader();
      this.documentNumber = 0;
    } catch (IOException e) {
      throw new ResourceInitializationException(e);
    }
  }

  /**
   * Counts the maximal runs of non-blank lines available from the given reader.
   */
  private static int countDocuments(BufferedReader in) throws IOException {
    int count = 0;
    boolean insideDocument = false;
    for (String line = in.readLine(); line != null; line = in.readLine()) {
      boolean blank = line.trim().length() == 0;
      if (!blank && !insideDocument) {
        // first non-blank line after a separator (or at the start) opens a new document
        count += 1;
      }
      insideDocument = !blank;
    }
    return count;
  }

  /**
   * Opens the data file for reading, transparently decompressing it when its name ends in
   * {@code .gz}.
   * 
   * @return a new reader positioned at the start of the data file
   * @throws IOException
   *           if the file cannot be opened or is not valid gzip data
   */
  private BufferedReader getBufferedReader() throws IOException {
    InputStream in = new FileInputStream(this.conll2005DataFile);
    try {
      if (this.conll2005DataFile.getName().endsWith(".gz")) {
        in = new GZIPInputStream(in);
      }
      // Decode with a fixed charset instead of the platform default so the same file reads
      // identically everywhere. NOTE(review): UTF-8 is assumed for the corpus -- confirm.
      return new BufferedReader(new InputStreamReader(in, "UTF-8"));
    } catch (IOException e) {
      // do not leak the file handle when wrapping the stream fails
      try {
        in.close();
      } catch (IOException ignored) {
        // the original failure is the one worth reporting
      }
      throw e;
    }
  }

  /**
   * Reads the next document from the data file into the CoNLL 2005 view of the given CAS and
   * sets the CAS URI to the file URI with the document number as fragment.
   * 
   * @throws CollectionException
   *           if no further document is available or the view cannot be created
   * @throws IOException
   *           if reading the data file fails
   */
  @Override
  public void getNext(JCas jCas) throws IOException, CollectionException {
    try {
      JCas conllView = jCas.createView(Conll2005Constants.CONLL_2005_VIEW);

      // skip the blank lines that separate documents
      String line = reader.readLine();
      while (line != null && line.trim().length() == 0) {
        line = reader.readLine();
      }

      if (line == null) {
        throw new CollectionException("unexpected end of input", null);
      }

      // collect lines up to the next blank line or the end of the file
      StringBuilder document = new StringBuilder();
      while (line != null && line.trim().length() != 0) {
        document.append(line.trim());
        document.append("\n");
        line = reader.readLine();
      }

      documentNumber += 1;

      conllView.setSofaDataString(document.toString(), "text/plain");
      URI fileURI = this.conll2005DataFile.toURI();
      String fragment = String.valueOf(this.documentNumber);
      URI uri;
      try {
        uri = new URI(fileURI.getScheme(), fileURI.getHost(), fileURI.getPath(), fragment);
      } catch (URISyntaxException e) {
        // should never reach this; fragment should always be valid since it's just a number
        throw new RuntimeException(e);
      }
      ViewUriUtil.setURI(jCas, uri);
    } catch (CASException e) {
      throw new CollectionException(e);
    }
  }

  /**
   * Closes the underlying reader, if one was opened.
   */
  @Override
  public void close() throws IOException {
    // reader is still null when initialization failed before the file could be opened
    if (reader != null) {
      reader.close();
    }
  }

  /**
   * Reports progress as the number of documents read out of the total counted at initialization.
   */
  @Override
  public Progress[] getProgress() {
    return new Progress[] { new ProgressImpl(documentNumber, totalDocuments, Progress.ENTITIES) };
  }

  /**
   * Returns whether any counted document has not yet been read. An empty (or all-blank) data file
   * therefore reports no documents instead of failing on the first {@link #getNext(JCas)}.
   */
  @Override
  public boolean hasNext() throws IOException, CollectionException {
    return documentNumber < totalDocuments;
  }
}