001/** 
002 * Copyright (c) 2007-2008, Regents of the University of Colorado 
003 * All rights reserved.
004 * 
005 * Redistribution and use in source and binary forms, with or without
006 * modification, are permitted provided that the following conditions are met:
007 * 
008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
011 * 
012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
022 * POSSIBILITY OF SUCH DAMAGE. 
023 */
024package org.cleartk.corpus.conll2003;
025
026import java.io.BufferedReader;
027import java.io.File;
028import java.io.FileNotFoundException;
029import java.io.FileReader;
030import java.io.IOException;
031import java.net.URI;
032import java.net.URISyntaxException;
033import java.util.ArrayList;
034import java.util.List;
035
036import org.apache.uima.UimaContext;
037import org.apache.uima.collection.CollectionException;
038import org.apache.uima.jcas.JCas;
039import org.apache.uima.jcas.cas.FSArray;
040import org.apache.uima.jcas.tcas.Annotation;
041import org.apache.uima.resource.ResourceInitializationException;
042import org.apache.uima.util.Progress;
043import org.apache.uima.util.ProgressImpl;
044import org.cleartk.ne.type.Chunk;
045import org.cleartk.ne.type.NamedEntity;
046import org.cleartk.ne.type.NamedEntityMention;
047import org.cleartk.token.type.Sentence;
048import org.cleartk.token.type.Token;
049import org.cleartk.util.ViewUriUtil;
050import org.apache.uima.fit.component.JCasCollectionReader_ImplBase;
051import org.apache.uima.fit.descriptor.ConfigurationParameter;
052import org.apache.uima.fit.descriptor.SofaCapability;
053
054/**
055 * <br>
056 * Copyright (c) 2007-2008, Regents of the University of Colorado <br>
057 * All rights reserved.
058 * 
059 * 
060 * @author Philip Ogren
061 * 
062 * 
063 *         This collection reader reads in the CoNLL 2003 named entity data. The data can be
064 *         retrieved from http://www.cnts.ua.ac.be/conll2003/ner/
065 * 
066 */
067@SofaCapability(outputSofas = ViewUriUtil.URI)
068public class Conll2003GoldReader extends JCasCollectionReader_ImplBase {
069  
070  public static final String PARAM_DATA_FILE_NAME = "dataFileName";
071
072  @ConfigurationParameter(
073      name = PARAM_DATA_FILE_NAME,
074      mandatory = true, description = "Points to CoNLL data (e.g. ner/eng.train).")
075  private String dataFileName;
076
077  public static final String PARAM_LOAD_NAMED_ENTITIES = "loadNamedEntities";
078
079  @ConfigurationParameter(
080      name = PARAM_LOAD_NAMED_ENTITIES,
081      mandatory = true, description = "determines if the named entities are loaded (i.e. named entity mention annotations are created) or if just plain text from the files is loaded.", defaultValue = "true")
082  private boolean loadNamedEntities;
083
084  public static final String DOCSTART = "-DOCSTART-";
085
086  BufferedReader reader;
087
088  boolean hasNext = true;
089
090  int documentIndex = 0;
091
092  int entityIdIndex = 0;
093
094  public void initialize(UimaContext context) throws ResourceInitializationException {
095
096    try {
097      File conllFile = new File(dataFileName);
098      reader = new BufferedReader(new FileReader(conllFile));
099      // advance the reader past the first occurrence of a document start and
100      // blank line.
101      String line;
102      while ((line = reader.readLine()) != null) {
103        if (line.trim().startsWith(DOCSTART)) {
104          reader.readLine(); // read the blank line
105          break;
106        }
107      }
108
109      sentenceTokens = new ArrayList<Token>();
110      sentenceChunks = new ArrayList<Chunk>();
111      chunkTokens = new ArrayList<Token>();
112      namedEntityTokens = new ArrayList<Token>();
113    } catch (FileNotFoundException fnfe) {
114      throw new ResourceInitializationException(fnfe);
115    } catch (IOException ioe) {
116      throw new ResourceInitializationException(ioe);
117    }
118  }
119
120  List<String> documentData; // contains every line from for the current document
121
122  StringBuffer documentText; // collects the text of the tokens and then we set the document text
123                             // with contents
124
125  int sentenceStart; // the start (character offset) of the current sentence
126
127  List<Token> sentenceTokens; // the tokens for the current sentence
128
129  List<Chunk> sentenceChunks; // the chunks for the current sentence
130
131  int tokenPosition; // the index of the current token
132
133  int chunkStart; // the start (char offset) of the current chunk
134
135  String currentChunkType; // the type of the current chunk (including the I- or B- prefix)
136
137  List<Token> chunkTokens; // the tokens for the current chunk
138
139  int namedEntityStart; // the start (char offset) of the current named entity
140
141  String currentNamedEntityType; // the type of the current named entity (including the I- or B-
142                                 // prefix)
143
144  List<Token> namedEntityTokens; // the tokens for the current named entity
145
146  public void getNext(JCas jCas) throws IOException, CollectionException {
147    // read in the data for the next document from the reader
148    documentData = new ArrayList<String>();
149    String line;
150    while ((line = reader.readLine()) != null && !line.startsWith(DOCSTART)) {
151      documentData.add(line.trim());
152    }
153
154    if (line == null)
155      hasNext = false;
156    else
157      line = reader.readLine().trim(); // advance past the blank line that follows
158                                       // "-DOCSTART- -X- O O"
159    // (we don't want an empty sentence on the front end!)
160    documentText = new StringBuffer();
161
162    initSentence();
163    tokenPosition = 0;
164    chunkStart = 0;
165    currentChunkType = "";
166    chunkTokens.clear();
167    namedEntityStart = 0;
168    currentNamedEntityType = "";
169    namedEntityTokens.clear();
170
171    for (String dataLine : documentData) {
172      if (dataLine.trim().equals("")) {
173        createChunk(jCas);
174        currentChunkType = "";
175        createNamedEntity(jCas);
176        currentNamedEntityType = "";
177
178        Sentence sentence = new Sentence(jCas, sentenceStart, documentText.length());
179        sentence.addToIndexes();
180
181        initSentence();
182      } else {
183        String[] dataPieces = dataLine.split(" ");
184        String tok = dataPieces[0];
185        String pos = dataPieces[1];
186        String chunkType = dataPieces[2];
187        if (currentChunkType.equals(""))
188          initChunk(chunkType);
189        String namedEntityType = dataPieces[3];
190        if (currentNamedEntityType.equals(""))
191          initNamedEntity(namedEntityType);
192
193        Token token = new Token(jCas, documentText.length(), documentText.length() + tok.length());
194        token.setPos(pos);
195        token.addToIndexes();
196
197        boolean chunkStartsWithB = startsWithB(currentChunkType, chunkType);
198        if (!chunkType.equals(currentChunkType) && !chunkStartsWithB) {
199          createChunk(jCas);
200          initChunk(chunkType);
201        }
202
203        boolean namedEntityStartsWithB = startsWithB(currentNamedEntityType, namedEntityType);
204
205        if (!namedEntityType.equals(currentNamedEntityType) && !namedEntityStartsWithB) {
206          createNamedEntity(jCas);
207          initNamedEntity(namedEntityType);
208        }
209
210        sentenceTokens.add(token);
211        chunkTokens.add(token);
212        namedEntityTokens.add(token);
213        documentText.append(tok + " ");
214      }
215    }
216
217    jCas.setDocumentText(documentText.toString());
218
219    URI fileURI = new File(dataFileName).toURI();
220    String fragment = String.valueOf(documentIndex);
221    URI uri;
222    try {
223      uri = new URI(fileURI.getScheme(), fileURI.getHost(), fileURI.getPath(), fragment);
224    } catch (URISyntaxException e) {
225      // should never reach this; fragment should always be valid since it's just a number
226      throw new RuntimeException(e);
227    }
228    ViewUriUtil.setURI(jCas, uri);
229    ++documentIndex;
230
231  }
232
233  private void initSentence() {
234    sentenceStart = documentText.length();
235    sentenceTokens.clear();
236    sentenceChunks.clear();
237  }
238
239  private void createChunk(JCas jCas) {
240    if (!currentChunkType.equals("O")) {
241      Chunk chunk = new Chunk(jCas, chunkStart, documentText.length() - 1);
242      chunk.setChunkType(currentChunkType.substring(2));
243      chunk.addToIndexes();
244      sentenceChunks.add(chunk);
245    }
246  }
247
248  private void initChunk(String chunkType) {
249    chunkStart = documentText.length();
250    chunkTokens.clear();
251    currentChunkType = chunkType;
252  }
253
254  private void createNamedEntity(JCas jCas) {
255    if (!currentNamedEntityType.equals("O") && loadNamedEntities) {
256      NamedEntity ne = new NamedEntity(jCas);
257      ne.setEntityClass("SPC");
258      ne.setEntityId("" + entityIdIndex++);
259      ne.setEntityType(currentNamedEntityType.substring(2));
260      ne.setEntitySubtype(currentNamedEntityType.substring(2));
261      ne.addToIndexes();
262
263      NamedEntityMention nem = new NamedEntityMention(
264          jCas,
265          namedEntityStart,
266          documentText.length() - 1);
267      nem.setMentionType("NAM");
268      Annotation annotation = new Annotation(jCas, namedEntityStart, documentText.length() - 1);
269      annotation.addToIndexes();
270      // Chunk chunk = new Chunk(jCas, namedEntityStart, documentText.length()-1);
271      // chunk.setTokens(UIMAUtil.toFSArray(jCas, namedEntityTokens));
272      // chunk.setChunkType("CoNLL NEM annotation");
273      // chunk.addToIndexes();
274      nem.setAnnotation(annotation);
275      nem.setHead(annotation);
276      nem.setMentionedEntity(ne);
277      nem.addToIndexes();
278
279      ne.setMentions(new FSArray(jCas, 1));
280      ne.setMentions(0, nem);
281    }
282  }
283
284  private void initNamedEntity(String namedEntityType) {
285    namedEntityStart = documentText.length();
286    namedEntityTokens.clear();
287    currentNamedEntityType = namedEntityType;
288  }
289
290  /**
291   * Determines if the read type is the same as the current type with the only difference being that
292   * the current type (bType) starts with "B-" and the read type starts with "I-".
293   * 
294   * @param bType
295   *          - the current type for the chunk or named entity
296   * @param iType
297   *          - the read chunk or named entity type for the token being examined.
298   * @return true if we should consder iType to be the same as bType so we know not to make a new
299   *         chunk or named entity.
300   */
301  private boolean startsWithB(String bType, String iType) {
302    if (bType.startsWith("B") && iType.startsWith("I")
303        && iType.substring(1).equals(bType.substring(1))) {
304      return true;
305    }
306    return false;
307  }
308
309  public void close() throws IOException {
310    reader.close();
311  }
312
313  public Progress[] getProgress() {
314    return new Progress[] { new ProgressImpl(documentIndex, 5000, Progress.ENTITIES) };
315  }
316
317  public boolean hasNext() throws IOException, CollectionException {
318    return hasNext;
319  }
320
321  public void setDataFileName(String dataFileName) {
322    this.dataFileName = dataFileName;
323  }
324
325  public void setLoadNamedEntities(boolean loadNamedEntities) {
326    this.loadNamedEntities = loadNamedEntities;
327  }
328
329}