001/** 
002 * Copyright (c) 2007-2008, Regents of the University of Colorado 
003 * All rights reserved.
004 * 
005 * Redistribution and use in source and binary forms, with or without
006 * modification, are permitted provided that the following conditions are met:
007 * 
008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
011 * 
012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
022 * POSSIBILITY OF SUCH DAMAGE. 
023 */
024package org.cleartk.corpus.genia.pos;
025
026import java.io.File;
027import java.io.IOException;
028import java.net.URI;
029import java.net.URISyntaxException;
030import java.util.HashSet;
031import java.util.List;
032import java.util.Set;
033
034import org.apache.uima.UimaContext;
035import org.apache.uima.cas.CAS;
036import org.apache.uima.cas.CASException;
037import org.apache.uima.collection.CollectionException;
038import org.apache.uima.collection.CollectionReader;
039import org.apache.uima.jcas.JCas;
040import org.apache.uima.pear.util.FileUtil;
041import org.apache.uima.resource.ResourceInitializationException;
042import org.apache.uima.util.Progress;
043import org.apache.uima.util.ProgressImpl;
044import org.cleartk.corpus.genia.pos.util.GeniaPosParser;
045import org.cleartk.corpus.genia.pos.util.GeniaParse;
046import org.cleartk.corpus.genia.pos.util.GeniaSentence;
047import org.cleartk.corpus.genia.pos.util.GeniaTag;
048import org.cleartk.corpus.genia.pos.util.Span;
049import org.cleartk.token.type.Sentence;
050import org.cleartk.token.type.Token;
051import org.cleartk.util.ViewUriUtil;
052import org.jdom2.JDOMException;
053import org.apache.uima.fit.component.JCasCollectionReader_ImplBase;
054import org.apache.uima.fit.descriptor.ConfigurationParameter;
055import org.apache.uima.fit.descriptor.SofaCapability;
056import org.apache.uima.fit.factory.CollectionReaderFactory;
057
058/**
059 * <br>
060 * Copyright (c) 2007-2008, Regents of the University of Colorado <br>
061 * All rights reserved.
062 * 
063 * <p>
064 * 
065 * @author Philip V. Ogren
066 * @see GeniaPosParser
067 */
068@SofaCapability(outputSofas = { ViewUriUtil.URI, GeniaPosViewName.GENIA_POS })
069public class GeniaPosGoldReader extends JCasCollectionReader_ImplBase {
070
071  public static final String GENIA_POS_VIEW = "GeniaPOSView";
072
073  public static final String PARAM_GENIA_CORPUS_FILE = "geniaCorpusFile";
074
075  @ConfigurationParameter(
076      name = PARAM_GENIA_CORPUS_FILE,
077      description = "names the file that is the Genia corpus to be loaded. A good value is probably '.../GENIAcorpus3.02.pos.xml'.  Please see README in this directory for edits that you may need to make to this file manually.",
078      mandatory = true)
079  private File geniaCorpusFile;
080
081  public static final String PARAM_LOAD_SENTENCES = "loadSentences";
082
083  @ConfigurationParameter(
084      name = PARAM_LOAD_SENTENCES,
085      mandatory = false,
086      description = "determines whether sentence annotations will be added from the Genia corpus.",
087      defaultValue = "true")
088  private boolean loadSentences = true;
089
090  public static final String PARAM_LOAD_TOKENS = "loadTokens";
091
092  @ConfigurationParameter(
093      name = PARAM_LOAD_TOKENS,
094      mandatory = false,
095      description = "determines whether tokens annotations will be added from the Genia corpus. ",
096      defaultValue = "true")
097  private boolean loadTokens = true;
098
099  public static final String PARAM_LOAD_POS_TAGS = "loadPosTags";
100
101  @ConfigurationParameter(
102      name = PARAM_LOAD_POS_TAGS,
103      mandatory = false,
104      description = "determines whether the part of speech tags assigned to each token in the genia corpus will be loaded. The default value of 'true' is used if this "
105          + "parameter is unspecified. If 'loadTokens' is 'false', then 'loadPOSTags' will be treated as 'false' regardless of what is given in the descriptor file.",
106      defaultValue = "true")
107  private boolean loadPosTags = true;
108
109  public static final String PARAM_ARTICLE_IDS_LIST_FILE = "articleIdsListFile";
110
111  @ConfigurationParameter(
112      name = PARAM_ARTICLE_IDS_LIST_FILE,
113      mandatory = false,
114      description = "names the file used to specify the article ids that should be read in")
115  File articleIdsListFile;
116
117  private boolean filterArticles;
118
119  private Set<String> articleIds;
120
121  private GeniaPosParser parser;
122
123  private GeniaParse parse;
124
125  private int progress = 0;
126
127  @Override
128  public void initialize(UimaContext context) throws ResourceInitializationException {
129
130    articleIds = new HashSet<String>();
131
132    try {
133      if (articleIdsListFile == null) {
134        filterArticles = false;
135      } else {
136        filterArticles = true;
137        String[] ids = FileUtil.loadListOfStrings(articleIdsListFile);
138        for (String id : ids) {
139          articleIds.add(id);
140        }
141      }
142
143      parser = new GeniaPosParser(geniaCorpusFile);
144      loadPosTags = loadTokens & loadPosTags;
145    } catch (IOException ioe) {
146      throw new ResourceInitializationException(ioe);
147    } catch (JDOMException je) {
148      throw new ResourceInitializationException(je);
149    }
150  }
151
152  public void getNext(JCas jCas) throws IOException, CollectionException {
153    if (!hasNext())
154      throw new CollectionException(
155          "Should not be calling getNext() because hasNext returns false",
156          null);
157    try {
158      JCas annotationsView = jCas.getView(CAS.NAME_DEFAULT_SOFA);
159      String text = parse.getText();
160      annotationsView.setDocumentText(text);
161
162      List<GeniaSentence> sentences = parse.getSentences();
163
164      for (GeniaSentence geniaSentence : sentences) {
165        if (loadTokens) {
166          List<GeniaTag> posTags = geniaSentence.getPosTags();
167          for (GeniaTag posTag : posTags) {
168            Span tokenSpan = posTag.getSpans().get(0);
169            Token token = new Token(annotationsView, tokenSpan.getBegin(), tokenSpan.getEnd());
170            if (loadPosTags)
171              token.setPos(posTag.getLabel());
172            token.addToIndexes();
173          }
174        }
175        if (loadSentences) {
176          Sentence sentence = new Sentence(
177              annotationsView,
178              geniaSentence.getSpan().getBegin(),
179              geniaSentence.getSpan().getEnd());
180          sentence.addToIndexes();
181        }
182      }
183
184      URI fileURI = this.geniaCorpusFile.toURI();
185      String fragment = this.parse.getMedline();
186      URI uri;
187      try {
188        uri = new URI(fileURI.getScheme(), fileURI.getHost(), fileURI.getPath(), fragment);
189      } catch (URISyntaxException e) {
190        // should never reach this; fragment should always be valid since it's just a number
191        throw new RuntimeException(e);
192      }
193      ViewUriUtil.setURI(jCas, uri);
194
195      JCas geniaView = jCas.createView(GeniaPosViewName.GENIA_POS);
196      geniaView.setDocumentText(parse.getXml());
197
198      parse = null;
199    } catch (CASException ce) {
200      throw new CollectionException(ce);
201    }
202
203  }
204
205  public void close() throws IOException {
206  }
207
208  public Progress[] getProgress() {
209    if (filterArticles) {
210      return new Progress[] { new ProgressImpl(progress, articleIds.size(), Progress.ENTITIES) };
211    } else {
212      return new Progress[] { new ProgressImpl(progress, 2000, Progress.ENTITIES) };
213    }
214  }
215
216  public boolean hasNext() throws IOException, CollectionException {
217    if (parse != null)
218      return true;
219    while (parser.hasNext()) {
220      parse = parser.next();
221      if (!filterArticles) {
222        progress++;
223        return true;
224      }
225      if (articleIds.contains(parse.getMedline())) {
226        progress++;
227        return true;
228      }
229    }
230    return false;
231  }
232
233  public static CollectionReader getDescription(String geniaCorpusFile)
234      throws ResourceInitializationException {
235    return CollectionReaderFactory.createReader(
236        GeniaPosGoldReader.class,
237        GeniaPosGoldReader.PARAM_GENIA_CORPUS_FILE,
238        geniaCorpusFile);
239  }
240
241  public static String[] TEST_FOLDS = new String[] {
242      "resources/genia/article_ids/fold-1-test.txt",
243      "resources/genia/article_ids/fold-2-test.txt",
244      "resources/genia/article_ids/fold-3-test.txt",
245      "resources/genia/article_ids/fold-4-test.txt",
246      "resources/genia/article_ids/fold-5-test.txt",
247      "resources/genia/article_ids/fold-6-test.txt",
248      "resources/genia/article_ids/fold-7-test.txt",
249      "resources/genia/article_ids/fold-8-test.txt",
250      "resources/genia/article_ids/fold-9-test.txt",
251      "resources/genia/article_ids/fold-10-test.txt", };
252
253  public static String[] TRAIN_FOLDS = new String[] {
254      "resources/genia/article_ids/fold-1-train.txt",
255      "resources/genia/article_ids/fold-2-train.txt",
256      "resources/genia/article_ids/fold-3-train.txt",
257      "resources/genia/article_ids/fold-4-train.txt",
258      "resources/genia/article_ids/fold-5-train.txt",
259      "resources/genia/article_ids/fold-6-train.txt",
260      "resources/genia/article_ids/fold-7-train.txt",
261      "resources/genia/article_ids/fold-8-train.txt",
262      "resources/genia/article_ids/fold-9-train.txt",
263      "resources/genia/article_ids/fold-10-train.txt", };
264
265}