001/** 002 * Copyright (c) 2007-2008, Regents of the University of Colorado 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without 006 * modification, are permitted provided that the following conditions are met: 007 * 008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 011 * 012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 022 * POSSIBILITY OF SUCH DAMAGE. 023 */ 024package org.cleartk.corpus.penntreebank; 025 026import java.io.File; 027import java.io.IOException; 028import java.util.Collections; 029import java.util.LinkedList; 030import java.util.List; 031 032import org.apache.uima.UimaContext; 033import org.apache.uima.analysis_engine.AnalysisEngineProcessException; 034import org.apache.uima.collection.CollectionException; 035import org.apache.uima.jcas.JCas; 036import org.apache.uima.resource.ResourceInitializationException; 037import org.apache.uima.util.FileUtils; 038import org.apache.uima.util.Level; 039import org.apache.uima.util.Progress; 040import org.apache.uima.util.ProgressImpl; 041import org.cleartk.util.ViewUriUtil; 042import org.apache.uima.fit.component.JCasCollectionReader_ImplBase; 043import org.apache.uima.fit.component.ViewCreatorAnnotator; 044import org.apache.uima.fit.descriptor.ConfigurationParameter; 045import org.apache.uima.fit.descriptor.SofaCapability; 046 047import com.google.common.annotations.Beta; 048 049/** 050 * <p> 051 * PennTreebankReader reads in the PennTreebank (PTB) data distributed by the LDC. It simply reads 052 * the raw treebank data into a view called "TreebankView". To actually parse the treebank data and 053 * post it to the CAS, you will need to use the TreebankGoldAnnotator which does the real work of 054 * parsing the treebank format. In general, treebank data can be read in by a 055 * PlainTextCollectionReader or some other simple collection reader. This class exists because the 056 * PennTreebank has a specific directory structure that corresponds to sections which are often used 057 * in specific ways to conduct experiments - e.g. section 02-20 for training and sections 21-24 for 058 * testing. This collection reader makes it easy to read in specific sections for later processing. 059 * Only files ending with ".mrg" will be read in. 060 * </p> 061 * <p> 062 * The acronym WSJ stands for Wall Street Journal which is the source of the articles treebanked by 063 * PTB. 064 * </p> 065 * 066 * <br> 067 * Copyright (c) 2007-2008, Regents of the University of Colorado <br> 068 * All rights reserved. 069 * 070 * @author Philip Ogren, Philipp Wetzler 071 */ 072 073@SofaCapability(outputSofas = { PennTreebankReader.TREEBANK_VIEW, ViewUriUtil.URI }) 074public class PennTreebankReader extends JCasCollectionReader_ImplBase { 075 /** 076 * The view containing the parenthesized text of a TreeBank .mrg file. 077 */ 078 public static final String TREEBANK_VIEW = "TREEBANK_VIEW"; 079 080 public static final String PARAM_CORPUS_DIRECTORY_NAME = "corpusDirectoryName"; 081 082 private static final String CORPUS_DIRECTORY_DESCRIPTION = "Specifies the location of WSJ/PennTreebank treebank files. " 083 + "The directory should contain subdirectories corresponding to the sections (e.g. '00', '01', etc.) " 084 + "That is, if a local copy of PennTreebank sits at C:/Data/PTB/wsj/mrg, then the the subdirectory C:/Data/PTB/wsj/mrg/00 should exist. " 085 + "There are 24 sections in PTB corresponding to the directories 00, 01, 02, ... 24. "; 086 087 @ConfigurationParameter( 088 name = PARAM_CORPUS_DIRECTORY_NAME, 089 mandatory = true, description = CORPUS_DIRECTORY_DESCRIPTION) 090 private String corpusDirectoryName; 091 092 public static final String PARAM_SECTIONS_SPECIFIER = "sectionsSpecifier"; 093 094 private static final String SECTIONS_DESCRIPTION = "specifies which sections of PTB to read in. " 095 + "The required format for values of this parameter allows for comma-separated section numbers and section ranges, " 096 + "for example '02,07-12,16'."; 097 098 @ConfigurationParameter(name = PARAM_SECTIONS_SPECIFIER, defaultValue = "00-24", description = SECTIONS_DESCRIPTION) 099 private String sectionsSpecifier; 100 101 protected File directory; 102 103 protected LinkedList<File> files; 104 105 protected int numberOfFiles; 106 107 protected ListSpecification sections; 108 109 @Override 110 public void initialize(UimaContext context) throws ResourceInitializationException { 111 this.sections = new ListSpecification(sectionsSpecifier); 112 113 this.directory = new File(corpusDirectoryName); 114 this.files = new LinkedList<File>(); 115 collectSections(new File(directory.getPath()), this.files, this.sections); 116 Collections.sort(files); 117 this.numberOfFiles = files.size(); 118 119 } 120 121 /** 122 * This will add all the <tt>.mrg</tt> files in the given WSJ sections to <em>treebankFiles</em>. 123 * 124 * @param wsjDirectory 125 * The top level of the WSJ part of Treebank. Underneath here are the section 126 * subdirectories. 127 * @param treebankFiles 128 * The {@link List} to which the treebank files should be added. 129 * @param wsjSections 130 * The set of sections to include. 131 */ 132 @Beta 133 public static void collectSections( 134 File wsjDirectory, 135 List<File> treebankFiles, 136 ListSpecification wsjSections) { 137 if (!wsjDirectory.isDirectory()) 138 return; 139 140 for (File subFile : wsjDirectory.listFiles()) { 141 if (!subFile.isDirectory()) 142 continue; 143 144 try { 145 int section = Integer.valueOf(subFile.getName()); 146 147 if (!wsjSections.contains(section)) 148 continue; 149 } catch (NumberFormatException e) { 150 continue; 151 } 152 153 collectFiles(subFile, treebankFiles); 154 } 155 } 156 157 static void collectFiles(File file, List<File> treebankFiles) { 158 if (file.isFile() && file.getName().endsWith(".mrg")) { 159 treebankFiles.add(file); 160 } else if (file.isDirectory()) { 161 for (File subFile : file.listFiles()) { 162 collectFiles(subFile, treebankFiles); 163 } 164 } 165 } 166 167 /** 168 * Reads the next file and stores its text in <b>cas</b> as the "TreebankView" SOFA. 169 */ 170 public void getNext(JCas jCas) throws IOException, CollectionException { 171 File treebankFile = files.removeFirst(); 172 getUimaContext().getLogger().log( 173 Level.FINEST, 174 "reading treebank file: " + treebankFile.getPath()); 175 ViewUriUtil.setURI(jCas, treebankFile.toURI()); 176 try { 177 JCas treebankView = ViewCreatorAnnotator.createViewSafely( 178 jCas, 179 TREEBANK_VIEW); 180 treebankView.setSofaDataString(FileUtils.file2String(treebankFile), "text/plain"); 181 } catch (AnalysisEngineProcessException aepe) { 182 throw new CollectionException(aepe); 183 } 184 } 185 186 public void close() throws IOException { 187 } 188 189 public Progress[] getProgress() { 190 return new Progress[] { new ProgressImpl( 191 numberOfFiles - files.size(), 192 numberOfFiles, 193 Progress.ENTITIES) }; 194 } 195 196 public boolean hasNext() throws IOException, CollectionException { 197 if (files.size() > 0) 198 return true; 199 else 200 return false; 201 } 202 203 @Beta 204 public void setCorpusDirectoryName(String corpusDirectoryName) { 205 this.corpusDirectoryName = corpusDirectoryName; 206 } 207 208 @Beta 209 public void setSectionsSpecifier(String sectionsString) { 210 this.sectionsSpecifier = sectionsString; 211 } 212 213}