001/** 
002 * Copyright (c) 2007-2008, Regents of the University of Colorado 
003 * All rights reserved.
004 * 
005 * Redistribution and use in source and binary forms, with or without
006 * modification, are permitted provided that the following conditions are met:
007 * 
008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
011 * 
012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
022 * POSSIBILITY OF SUCH DAMAGE. 
023 */
024package org.cleartk.corpus.penntreebank;
025
026import java.io.File;
027import java.io.IOException;
028import java.util.Collections;
029import java.util.LinkedList;
030import java.util.List;
031
032import org.apache.uima.UimaContext;
033import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
034import org.apache.uima.collection.CollectionException;
035import org.apache.uima.jcas.JCas;
036import org.apache.uima.resource.ResourceInitializationException;
037import org.apache.uima.util.FileUtils;
038import org.apache.uima.util.Level;
039import org.apache.uima.util.Progress;
040import org.apache.uima.util.ProgressImpl;
041import org.cleartk.util.ViewUriUtil;
042import org.apache.uima.fit.component.JCasCollectionReader_ImplBase;
043import org.apache.uima.fit.component.ViewCreatorAnnotator;
044import org.apache.uima.fit.descriptor.ConfigurationParameter;
045import org.apache.uima.fit.descriptor.SofaCapability;
046
047import com.google.common.annotations.Beta;
048
049/**
050 * <p>
051 * PennTreebankReader reads in the PennTreebank (PTB) data distributed by the LDC. It simply reads
052 * the raw treebank data into a view called "TreebankView". To actually parse the treebank data and
053 * post it to the CAS, you will need to use the TreebankGoldAnnotator which does the real work of
054 * parsing the treebank format. In general, treebank data can be read in by a
055 * PlainTextCollectionReader or some other simple collection reader. This class exists because the
056 * PennTreebank has a specific directory structure that corresponds to sections which are often used
057 * in specific ways to conduct experiments - e.g. section 02-20 for training and sections 21-24 for
058 * testing. This collection reader makes it easy to read in specific sections for later processing.
059 * Only files ending with ".mrg" will be read in.
060 * </p>
061 * <p>
062 * The acronym WSJ stands for Wall Street Journal which is the source of the articles treebanked by
063 * PTB.
064 * </p>
065 * 
066 * <br>
067 * Copyright (c) 2007-2008, Regents of the University of Colorado <br>
068 * All rights reserved.
069 * 
070 * @author Philip Ogren, Philipp Wetzler
071 */
072
073@SofaCapability(outputSofas = { PennTreebankReader.TREEBANK_VIEW, ViewUriUtil.URI })
074public class PennTreebankReader extends JCasCollectionReader_ImplBase {
075  /**
076   * The view containing the parenthesized text of a TreeBank .mrg file.
077   */
078  public static final String TREEBANK_VIEW = "TREEBANK_VIEW";
079
080  public static final String PARAM_CORPUS_DIRECTORY_NAME = "corpusDirectoryName";
081
082  private static final String CORPUS_DIRECTORY_DESCRIPTION = "Specifies the location of WSJ/PennTreebank treebank files.  "
083      + "The directory should contain subdirectories corresponding to the sections (e.g. '00', '01', etc.) "
084      + "That is, if a local copy of PennTreebank sits at C:/Data/PTB/wsj/mrg, then the the subdirectory C:/Data/PTB/wsj/mrg/00 should exist. "
085      + "There are 24 sections in PTB corresponding to the directories 00, 01, 02, ... 24. ";
086
087  @ConfigurationParameter(
088      name = PARAM_CORPUS_DIRECTORY_NAME,
089      mandatory = true, description = CORPUS_DIRECTORY_DESCRIPTION)
090  private String corpusDirectoryName;
091
092  public static final String PARAM_SECTIONS_SPECIFIER = "sectionsSpecifier";
093
094  private static final String SECTIONS_DESCRIPTION = "specifies which sections of PTB to read in.  "
095      + "The required format for values of this parameter allows for comma-separated section numbers and section ranges, "
096      + "for example '02,07-12,16'.";
097
098  @ConfigurationParameter(name = PARAM_SECTIONS_SPECIFIER, defaultValue = "00-24", description = SECTIONS_DESCRIPTION)
099  private String sectionsSpecifier;
100
101  protected File directory;
102
103  protected LinkedList<File> files;
104
105  protected int numberOfFiles;
106
107  protected ListSpecification sections;
108
109  @Override
110  public void initialize(UimaContext context) throws ResourceInitializationException {
111    this.sections = new ListSpecification(sectionsSpecifier);
112
113    this.directory = new File(corpusDirectoryName);
114    this.files = new LinkedList<File>();
115    collectSections(new File(directory.getPath()), this.files, this.sections);
116    Collections.sort(files);
117    this.numberOfFiles = files.size();
118
119  }
120
121  /**
122   * This will add all the <tt>.mrg</tt> files in the given WSJ sections to <em>treebankFiles</em>.
123   * 
124   * @param wsjDirectory
125   *          The top level of the WSJ part of Treebank. Underneath here are the section
126   *          subdirectories.
127   * @param treebankFiles
128   *          The {@link List} to which the treebank files should be added.
129   * @param wsjSections
130   *          The set of sections to include.
131   */
132  @Beta
133  public static void collectSections(
134      File wsjDirectory,
135      List<File> treebankFiles,
136      ListSpecification wsjSections) {
137    if (!wsjDirectory.isDirectory())
138      return;
139
140    for (File subFile : wsjDirectory.listFiles()) {
141      if (!subFile.isDirectory())
142        continue;
143
144      try {
145        int section = Integer.valueOf(subFile.getName());
146
147        if (!wsjSections.contains(section))
148          continue;
149      } catch (NumberFormatException e) {
150        continue;
151      }
152
153      collectFiles(subFile, treebankFiles);
154    }
155  }
156
157  static void collectFiles(File file, List<File> treebankFiles) {
158    if (file.isFile() && file.getName().endsWith(".mrg")) {
159      treebankFiles.add(file);
160    } else if (file.isDirectory()) {
161      for (File subFile : file.listFiles()) {
162        collectFiles(subFile, treebankFiles);
163      }
164    }
165  }
166
167  /**
168   * Reads the next file and stores its text in <b>cas</b> as the "TreebankView" SOFA.
169   */
170  public void getNext(JCas jCas) throws IOException, CollectionException {
171    File treebankFile = files.removeFirst();
172    getUimaContext().getLogger().log(
173        Level.FINEST,
174        "reading treebank file: " + treebankFile.getPath());
175    ViewUriUtil.setURI(jCas, treebankFile.toURI());
176    try {
177      JCas treebankView = ViewCreatorAnnotator.createViewSafely(
178          jCas,
179          TREEBANK_VIEW);
180      treebankView.setSofaDataString(FileUtils.file2String(treebankFile), "text/plain");
181    } catch (AnalysisEngineProcessException aepe) {
182      throw new CollectionException(aepe);
183    }
184  }
185
186  public void close() throws IOException {
187  }
188
189  public Progress[] getProgress() {
190    return new Progress[] { new ProgressImpl(
191        numberOfFiles - files.size(),
192        numberOfFiles,
193        Progress.ENTITIES) };
194  }
195
196  public boolean hasNext() throws IOException, CollectionException {
197    if (files.size() > 0)
198      return true;
199    else
200      return false;
201  }
202
203  @Beta
204  public void setCorpusDirectoryName(String corpusDirectoryName) {
205    this.corpusDirectoryName = corpusDirectoryName;
206  }
207
208  @Beta
209  public void setSectionsSpecifier(String sectionsString) {
210    this.sectionsSpecifier = sectionsString;
211  }
212
213}