/**
 * Copyright (c) 2007-2008, Regents of the University of Colorado
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
package org.cleartk.corpus.genia.pos;

import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.uima.UimaContext;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.collection.CollectionReader;
import org.apache.uima.jcas.JCas;
import org.apache.uima.pear.util.FileUtil;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Progress;
import org.apache.uima.util.ProgressImpl;
import org.cleartk.corpus.genia.pos.util.GeniaPosParser;
import org.cleartk.corpus.genia.pos.util.GeniaParse;
import org.cleartk.corpus.genia.pos.util.GeniaSentence;
import org.cleartk.corpus.genia.pos.util.GeniaTag;
import org.cleartk.corpus.genia.pos.util.Span;
import org.cleartk.token.type.Sentence;
import org.cleartk.token.type.Token;
import org.cleartk.util.ViewUriUtil;
import org.jdom2.JDOMException;
import org.apache.uima.fit.component.JCasCollectionReader_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.SofaCapability;
import org.apache.uima.fit.factory.CollectionReaderFactory;

/**
 * <br>
 * Copyright (c) 2007-2008, Regents of the University of Colorado <br>
 * All rights reserved.
 *
 * <p>
 * Collection reader that walks the parses produced by a {@link GeniaPosParser} over the
 * configured corpus file. For every parse it emits, the reader
 * <ul>
 * <li>sets the parse text as the document text of the default view,</li>
 * <li>optionally adds {@link Token} annotations (with or without part-of-speech labels) and
 * {@link Sentence} annotations to that view,</li>
 * <li>records a URI made of the corpus file location plus the parse's Medline id as fragment,
 * and</li>
 * <li>creates the {@code GeniaPosViewName.GENIA_POS} view holding the parse's XML.</li>
 * </ul>
 * When an article id list file is configured, only parses whose Medline id appears in that list
 * are emitted.
 *
 * @author Philip V. Ogren
 * @see GeniaPosParser
 */
@SofaCapability(outputSofas = { ViewUriUtil.URI, GeniaPosViewName.GENIA_POS })
public class GeniaPosGoldReader extends JCasCollectionReader_ImplBase {

  /**
   * View name constant kept for API compatibility. It is not referenced in this class; the view
   * actually created by {@link #getNext(JCas)} is {@code GeniaPosViewName.GENIA_POS}.
   */
  public static final String GENIA_POS_VIEW = "GeniaPOSView";

  /** Name of the mandatory parameter that points at the corpus file. */
  public static final String PARAM_GENIA_CORPUS_FILE = "geniaCorpusFile";

  @ConfigurationParameter(
      name = PARAM_GENIA_CORPUS_FILE,
      description = "names the file that is the Genia corpus to be loaded. A good value is probably '.../GENIAcorpus3.02.pos.xml'. Please see README in this directory for edits that you may need to make to this file manually.",
      mandatory = true)
  private File geniaCorpusFile;

  /** Name of the optional parameter that toggles creation of sentence annotations. */
  public static final String PARAM_LOAD_SENTENCES = "loadSentences";

  @ConfigurationParameter(
      name = PARAM_LOAD_SENTENCES,
      mandatory = false,
      description = "determines whether sentence annotations will be added from the Genia corpus.",
      defaultValue = "true")
  private boolean loadSentences = true;

  /** Name of the optional parameter that toggles creation of token annotations. */
  public static final String PARAM_LOAD_TOKENS = "loadTokens";

  @ConfigurationParameter(
      name = PARAM_LOAD_TOKENS,
      mandatory = false,
      description = "determines whether tokens annotations will be added from the Genia corpus. ",
      defaultValue = "true")
  private boolean loadTokens = true;

  /** Name of the optional parameter that toggles setting part-of-speech labels on tokens. */
  public static final String PARAM_LOAD_POS_TAGS = "loadPosTags";

  @ConfigurationParameter(
      name = PARAM_LOAD_POS_TAGS,
      mandatory = false,
      description = "determines whether the part of speech tags assigned to each token in the genia corpus will be loaded. The default value of 'true' is used if this "
          + "parameter is unspecified. If 'loadTokens' is 'false', then 'loadPosTags' will be treated as 'false' regardless of what is given in the descriptor file.",
      defaultValue = "true")
  private boolean loadPosTags = true;

  /** Name of the optional parameter that points at a file listing the article ids to read. */
  public static final String PARAM_ARTICLE_IDS_LIST_FILE = "articleIdsListFile";

  @ConfigurationParameter(
      name = PARAM_ARTICLE_IDS_LIST_FILE,
      mandatory = false,
      description = "names the file used to specify the article ids that should be read in")
  File articleIdsListFile;

  /**
   * Total reported by {@link #getProgress()} when no article id list is configured.
   * NOTE(review): presumably the number of articles in the GENIA corpus -- confirm.
   */
  private static final int DEFAULT_TOTAL_ENTITIES = 2000;

  /** True when an article id list file was configured. */
  private boolean filterArticles;

  /** Ids loaded from the article id list file; empty when no file was configured. */
  private Set<String> articleIds;

  private GeniaPosParser parser;

  /** The parse that the next call to {@link #getNext(JCas)} will emit, or null if none is pending. */
  private GeniaParse parse;

  /** Number of parses accepted by {@link #hasNext()} so far. */
  private int progress = 0;

  /**
   * Loads the optional article id list, opens the corpus parser, and forces
   * {@code loadPosTags} off when tokens are not being loaded.
   *
   * @param context
   *          the UIMA context (not used directly by this method)
   * @throws ResourceInitializationException
   *           if reading the id list or creating the parser fails with an {@link IOException} or
   *           {@link JDOMException}
   */
  @Override
  public void initialize(UimaContext context) throws ResourceInitializationException {
    articleIds = new HashSet<String>();
    filterArticles = articleIdsListFile != null;

    try {
      if (filterArticles) {
        for (String id : FileUtil.loadListOfStrings(articleIdsListFile)) {
          articleIds.add(id);
        }
      }
      parser = new GeniaPosParser(geniaCorpusFile);
    } catch (IOException ioe) {
      throw new ResourceInitializationException(ioe);
    } catch (JDOMException je) {
      throw new ResourceInitializationException(je);
    }

    // Part-of-speech labels live on tokens, so they cannot be loaded without them.
    loadPosTags = loadTokens && loadPosTags;
  }

  /**
   * Populates {@code jCas} from the pending parse and then clears it so that the following
   * {@link #hasNext()} call advances the parser.
   *
   * @param jCas
   *          the CAS to fill
   * @throws CollectionException
   *           if there is no further parse to emit, or if a view cannot be obtained or created
   */
  @Override
  public void getNext(JCas jCas) throws IOException, CollectionException {
    if (!hasNext()) {
      throw new CollectionException(
          "Should not be calling getNext() because hasNext returns false",
          null);
    }
    try {
      JCas annotationsView = jCas.getView(CAS.NAME_DEFAULT_SOFA);
      annotationsView.setDocumentText(parse.getText());

      List<GeniaSentence> sentences = parse.getSentences();
      for (GeniaSentence geniaSentence : sentences) {
        if (loadTokens) {
          addTokens(annotationsView, geniaSentence);
        }
        if (loadSentences) {
          Sentence sentence = new Sentence(
              annotationsView,
              geniaSentence.getSpan().getBegin(),
              geniaSentence.getSpan().getEnd());
          sentence.addToIndexes();
        }
      }

      ViewUriUtil.setURI(jCas, createArticleUri());

      JCas geniaView = jCas.createView(GeniaPosViewName.GENIA_POS);
      geniaView.setDocumentText(parse.getXml());

      // Mark the pending parse as consumed.
      parse = null;
    } catch (CASException ce) {
      throw new CollectionException(ce);
    }
  }

  /** Adds one {@link Token} per tag of the sentence, using the first span of each tag. */
  private void addTokens(JCas annotationsView, GeniaSentence geniaSentence) {
    List<GeniaTag> posTags = geniaSentence.getPosTags();
    for (GeniaTag posTag : posTags) {
      Span tokenSpan = posTag.getSpans().get(0);
      Token token = new Token(annotationsView, tokenSpan.getBegin(), tokenSpan.getEnd());
      if (loadPosTags) {
        token.setPos(posTag.getLabel());
      }
      token.addToIndexes();
    }
  }

  /** Builds the corpus file URI with the pending parse's Medline id as its fragment. */
  private URI createArticleUri() {
    URI fileURI = this.geniaCorpusFile.toURI();
    String fragment = this.parse.getMedline();
    try {
      return new URI(fileURI.getScheme(), fileURI.getHost(), fileURI.getPath(), fragment);
    } catch (URISyntaxException e) {
      // should never reach this; fragment should always be valid since it's just a number
      throw new RuntimeException(e);
    }
  }

  /** Nothing to release; present to satisfy the reader contract. */
  @Override
  public void close() throws IOException {
  }

  /**
   * Reports the number of parses accepted so far against the expected total: the size of the
   * article id list when filtering, otherwise {@link #DEFAULT_TOTAL_ENTITIES}.
   *
   * @return a single-element array measured in {@link Progress#ENTITIES}
   */
  @Override
  public Progress[] getProgress() {
    int total = filterArticles ? articleIds.size() : DEFAULT_TOTAL_ENTITIES;
    return new Progress[] { new ProgressImpl(progress, total, Progress.ENTITIES) };
  }

  /**
   * Advances the parser to the next parse that passes the article id filter, unless one is
   * already pending.
   *
   * @return true if a parse is pending for {@link #getNext(JCas)}, false once the parser is
   *         exhausted
   */
  @Override
  public boolean hasNext() throws IOException, CollectionException {
    if (parse != null) {
      return true;
    }
    while (parser.hasNext()) {
      // Hold the candidate in a local: storing a rejected parse in the field would make a later
      // hasNext() call report it as pending once the parser is exhausted.
      GeniaParse candidate = parser.next();
      if (!filterArticles || articleIds.contains(candidate.getMedline())) {
        parse = candidate;
        progress++;
        return true;
      }
    }
    return false;
  }

  /**
   * Creates a reader for the given corpus file with all other parameters left at their defaults.
   * Despite its name, this method returns a reader instance, not a description.
   *
   * @param geniaCorpusFile
   *          value for {@link #PARAM_GENIA_CORPUS_FILE}
   * @return the reader created by {@link CollectionReaderFactory}
   * @throws ResourceInitializationException
   *           if the reader cannot be created
   */
  public static CollectionReader getDescription(String geniaCorpusFile)
      throws ResourceInitializationException {
    return CollectionReaderFactory.createReader(
        GeniaPosGoldReader.class,
        GeniaPosGoldReader.PARAM_GENIA_CORPUS_FILE,
        geniaCorpusFile);
  }

  /**
   * Paths of the article id list files for the test portion of each of the ten folds.
   * NOTE(review): this array is public and mutable; callers should treat it as read-only.
   */
  public static String[] TEST_FOLDS = new String[] {
      "resources/genia/article_ids/fold-1-test.txt",
      "resources/genia/article_ids/fold-2-test.txt",
      "resources/genia/article_ids/fold-3-test.txt",
      "resources/genia/article_ids/fold-4-test.txt",
      "resources/genia/article_ids/fold-5-test.txt",
      "resources/genia/article_ids/fold-6-test.txt",
      "resources/genia/article_ids/fold-7-test.txt",
      "resources/genia/article_ids/fold-8-test.txt",
      "resources/genia/article_ids/fold-9-test.txt",
      "resources/genia/article_ids/fold-10-test.txt", };

  /**
   * Paths of the article id list files for the training portion of each of the ten folds.
   * NOTE(review): this array is public and mutable; callers should treat it as read-only.
   */
  public static String[] TRAIN_FOLDS = new String[] {
      "resources/genia/article_ids/fold-1-train.txt",
      "resources/genia/article_ids/fold-2-train.txt",
      "resources/genia/article_ids/fold-3-train.txt",
      "resources/genia/article_ids/fold-4-train.txt",
      "resources/genia/article_ids/fold-5-train.txt",
      "resources/genia/article_ids/fold-6-train.txt",
      "resources/genia/article_ids/fold-7-train.txt",
      "resources/genia/article_ids/fold-8-train.txt",
      "resources/genia/article_ids/fold-9-train.txt",
      "resources/genia/article_ids/fold-10-train.txt", };

}