001/** 002 * Copyright (c) 2007-2008, Regents of the University of Colorado 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without 006 * modification, are permitted provided that the following conditions are met: 007 * 008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 011 * 012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 022 * POSSIBILITY OF SUCH DAMAGE. 023 */ 024package org.cleartk.corpus.conll2003; 025 026import java.io.BufferedReader; 027import java.io.File; 028import java.io.FileNotFoundException; 029import java.io.FileReader; 030import java.io.IOException; 031import java.net.URI; 032import java.net.URISyntaxException; 033import java.util.ArrayList; 034import java.util.List; 035 036import org.apache.uima.UimaContext; 037import org.apache.uima.collection.CollectionException; 038import org.apache.uima.jcas.JCas; 039import org.apache.uima.jcas.cas.FSArray; 040import org.apache.uima.jcas.tcas.Annotation; 041import org.apache.uima.resource.ResourceInitializationException; 042import org.apache.uima.util.Progress; 043import org.apache.uima.util.ProgressImpl; 044import org.cleartk.ne.type.Chunk; 045import org.cleartk.ne.type.NamedEntity; 046import org.cleartk.ne.type.NamedEntityMention; 047import org.cleartk.token.type.Sentence; 048import org.cleartk.token.type.Token; 049import org.cleartk.util.ViewUriUtil; 050import org.apache.uima.fit.component.JCasCollectionReader_ImplBase; 051import org.apache.uima.fit.descriptor.ConfigurationParameter; 052import org.apache.uima.fit.descriptor.SofaCapability; 053 054/** 055 * <br> 056 * Copyright (c) 2007-2008, Regents of the University of Colorado <br> 057 * All rights reserved. 058 * 059 * 060 * @author Philip Ogren 061 * 062 * 063 * This collection reader reads in the CoNLL 2003 named entity data. The data can be 064 * retrieved from http://www.cnts.ua.ac.be/conll2003/ner/ 065 * 066 */ 067@SofaCapability(outputSofas = ViewUriUtil.URI) 068public class Conll2003GoldReader extends JCasCollectionReader_ImplBase { 069 070 public static final String PARAM_DATA_FILE_NAME = "dataFileName"; 071 072 @ConfigurationParameter( 073 name = PARAM_DATA_FILE_NAME, 074 mandatory = true, description = "Points to CoNLL data (e.g. ner/eng.train).") 075 private String dataFileName; 076 077 public static final String PARAM_LOAD_NAMED_ENTITIES = "loadNamedEntities"; 078 079 @ConfigurationParameter( 080 name = PARAM_LOAD_NAMED_ENTITIES, 081 mandatory = true, description = "determines if the named entities are loaded (i.e. named entity mention annotations are created) or if just plain text from the files is loaded.", defaultValue = "true") 082 private boolean loadNamedEntities; 083 084 public static final String DOCSTART = "-DOCSTART-"; 085 086 BufferedReader reader; 087 088 boolean hasNext = true; 089 090 int documentIndex = 0; 091 092 int entityIdIndex = 0; 093 094 public void initialize(UimaContext context) throws ResourceInitializationException { 095 096 try { 097 File conllFile = new File(dataFileName); 098 reader = new BufferedReader(new FileReader(conllFile)); 099 // advance the reader past the first occurrence of a document start and 100 // blank line. 101 String line; 102 while ((line = reader.readLine()) != null) { 103 if (line.trim().startsWith(DOCSTART)) { 104 reader.readLine(); // read the blank line 105 break; 106 } 107 } 108 109 sentenceTokens = new ArrayList<Token>(); 110 sentenceChunks = new ArrayList<Chunk>(); 111 chunkTokens = new ArrayList<Token>(); 112 namedEntityTokens = new ArrayList<Token>(); 113 } catch (FileNotFoundException fnfe) { 114 throw new ResourceInitializationException(fnfe); 115 } catch (IOException ioe) { 116 throw new ResourceInitializationException(ioe); 117 } 118 } 119 120 List<String> documentData; // contains every line from for the current document 121 122 StringBuffer documentText; // collects the text of the tokens and then we set the document text 123 // with contents 124 125 int sentenceStart; // the start (character offset) of the current sentence 126 127 List<Token> sentenceTokens; // the tokens for the current sentence 128 129 List<Chunk> sentenceChunks; // the chunks for the current sentence 130 131 int tokenPosition; // the index of the current token 132 133 int chunkStart; // the start (char offset) of the current chunk 134 135 String currentChunkType; // the type of the current chunk (including the I- or B- prefix) 136 137 List<Token> chunkTokens; // the tokens for the current chunk 138 139 int namedEntityStart; // the start (char offset) of the current named entity 140 141 String currentNamedEntityType; // the type of the current named entity (including the I- or B- 142 // prefix) 143 144 List<Token> namedEntityTokens; // the tokens for the current named entity 145 146 public void getNext(JCas jCas) throws IOException, CollectionException { 147 // read in the data for the next document from the reader 148 documentData = new ArrayList<String>(); 149 String line; 150 while ((line = reader.readLine()) != null && !line.startsWith(DOCSTART)) { 151 documentData.add(line.trim()); 152 } 153 154 if (line == null) 155 hasNext = false; 156 else 157 line = reader.readLine().trim(); // advance past the blank line that follows 158 // "-DOCSTART- -X- O O" 159 // (we don't want an empty sentence on the front end!) 160 documentText = new StringBuffer(); 161 162 initSentence(); 163 tokenPosition = 0; 164 chunkStart = 0; 165 currentChunkType = ""; 166 chunkTokens.clear(); 167 namedEntityStart = 0; 168 currentNamedEntityType = ""; 169 namedEntityTokens.clear(); 170 171 for (String dataLine : documentData) { 172 if (dataLine.trim().equals("")) { 173 createChunk(jCas); 174 currentChunkType = ""; 175 createNamedEntity(jCas); 176 currentNamedEntityType = ""; 177 178 Sentence sentence = new Sentence(jCas, sentenceStart, documentText.length()); 179 sentence.addToIndexes(); 180 181 initSentence(); 182 } else { 183 String[] dataPieces = dataLine.split(" "); 184 String tok = dataPieces[0]; 185 String pos = dataPieces[1]; 186 String chunkType = dataPieces[2]; 187 if (currentChunkType.equals("")) 188 initChunk(chunkType); 189 String namedEntityType = dataPieces[3]; 190 if (currentNamedEntityType.equals("")) 191 initNamedEntity(namedEntityType); 192 193 Token token = new Token(jCas, documentText.length(), documentText.length() + tok.length()); 194 token.setPos(pos); 195 token.addToIndexes(); 196 197 boolean chunkStartsWithB = startsWithB(currentChunkType, chunkType); 198 if (!chunkType.equals(currentChunkType) && !chunkStartsWithB) { 199 createChunk(jCas); 200 initChunk(chunkType); 201 } 202 203 boolean namedEntityStartsWithB = startsWithB(currentNamedEntityType, namedEntityType); 204 205 if (!namedEntityType.equals(currentNamedEntityType) && !namedEntityStartsWithB) { 206 createNamedEntity(jCas); 207 initNamedEntity(namedEntityType); 208 } 209 210 sentenceTokens.add(token); 211 chunkTokens.add(token); 212 namedEntityTokens.add(token); 213 documentText.append(tok + " "); 214 } 215 } 216 217 jCas.setDocumentText(documentText.toString()); 218 219 URI fileURI = new File(dataFileName).toURI(); 220 String fragment = String.valueOf(documentIndex); 221 URI uri; 222 try { 223 uri = new URI(fileURI.getScheme(), fileURI.getHost(), fileURI.getPath(), fragment); 224 } catch (URISyntaxException e) { 225 // should never reach this; fragment should always be valid since it's just a number 226 throw new RuntimeException(e); 227 } 228 ViewUriUtil.setURI(jCas, uri); 229 ++documentIndex; 230 231 } 232 233 private void initSentence() { 234 sentenceStart = documentText.length(); 235 sentenceTokens.clear(); 236 sentenceChunks.clear(); 237 } 238 239 private void createChunk(JCas jCas) { 240 if (!currentChunkType.equals("O")) { 241 Chunk chunk = new Chunk(jCas, chunkStart, documentText.length() - 1); 242 chunk.setChunkType(currentChunkType.substring(2)); 243 chunk.addToIndexes(); 244 sentenceChunks.add(chunk); 245 } 246 } 247 248 private void initChunk(String chunkType) { 249 chunkStart = documentText.length(); 250 chunkTokens.clear(); 251 currentChunkType = chunkType; 252 } 253 254 private void createNamedEntity(JCas jCas) { 255 if (!currentNamedEntityType.equals("O") && loadNamedEntities) { 256 NamedEntity ne = new NamedEntity(jCas); 257 ne.setEntityClass("SPC"); 258 ne.setEntityId("" + entityIdIndex++); 259 ne.setEntityType(currentNamedEntityType.substring(2)); 260 ne.setEntitySubtype(currentNamedEntityType.substring(2)); 261 ne.addToIndexes(); 262 263 NamedEntityMention nem = new NamedEntityMention( 264 jCas, 265 namedEntityStart, 266 documentText.length() - 1); 267 nem.setMentionType("NAM"); 268 Annotation annotation = new Annotation(jCas, namedEntityStart, documentText.length() - 1); 269 annotation.addToIndexes(); 270 // Chunk chunk = new Chunk(jCas, namedEntityStart, documentText.length()-1); 271 // chunk.setTokens(UIMAUtil.toFSArray(jCas, namedEntityTokens)); 272 // chunk.setChunkType("CoNLL NEM annotation"); 273 // chunk.addToIndexes(); 274 nem.setAnnotation(annotation); 275 nem.setHead(annotation); 276 nem.setMentionedEntity(ne); 277 nem.addToIndexes(); 278 279 ne.setMentions(new FSArray(jCas, 1)); 280 ne.setMentions(0, nem); 281 } 282 } 283 284 private void initNamedEntity(String namedEntityType) { 285 namedEntityStart = documentText.length(); 286 namedEntityTokens.clear(); 287 currentNamedEntityType = namedEntityType; 288 } 289 290 /** 291 * Determines if the read type is the same as the current type with the only difference being that 292 * the current type (bType) starts with "B-" and the read type starts with "I-". 293 * 294 * @param bType 295 * - the current type for the chunk or named entity 296 * @param iType 297 * - the read chunk or named entity type for the token being examined. 298 * @return true if we should consder iType to be the same as bType so we know not to make a new 299 * chunk or named entity. 300 */ 301 private boolean startsWithB(String bType, String iType) { 302 if (bType.startsWith("B") && iType.startsWith("I") 303 && iType.substring(1).equals(bType.substring(1))) { 304 return true; 305 } 306 return false; 307 } 308 309 public void close() throws IOException { 310 reader.close(); 311 } 312 313 public Progress[] getProgress() { 314 return new Progress[] { new ProgressImpl(documentIndex, 5000, Progress.ENTITIES) }; 315 } 316 317 public boolean hasNext() throws IOException, CollectionException { 318 return hasNext; 319 } 320 321 public void setDataFileName(String dataFileName) { 322 this.dataFileName = dataFileName; 323 } 324 325 public void setLoadNamedEntities(boolean loadNamedEntities) { 326 this.loadNamedEntities = loadNamedEntities; 327 } 328 329}