001/** 
002 * Copyright (c) 2007-2008, Regents of the University of Colorado 
003 * All rights reserved.
004 * 
005 * Redistribution and use in source and binary forms, with or without
006 * modification, are permitted provided that the following conditions are met:
007 * 
008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
011 * 
012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
022 * POSSIBILITY OF SUCH DAMAGE. 
023 */
024package org.cleartk.timeml.tlink;
025
026import java.io.File;
027import java.io.IOException;
028
029import org.apache.uima.util.FileUtils;
030import org.apache.uima.util.Level;
031import org.apache.uima.util.Logger;
032import org.cleartk.corpus.timeml.PlainTextTlinkGoldAnnotator;
033import org.cleartk.corpus.timeml.TimeMlGoldAnnotator;
034import org.cleartk.corpus.timeml.TreebankAligningAnnotator;
035import org.cleartk.ml.jar.JarClassifierBuilder;
036import org.cleartk.snowball.DefaultSnowballStemmer;
037import org.cleartk.util.cr.FilesCollectionReader;
038import org.apache.uima.fit.factory.UimaContextFactory;
039import org.apache.uima.fit.pipeline.SimplePipeline;
040
041/**
042 * <br>
043 * Copyright (c) 2007-2008, Regents of the University of Colorado <br>
044 * All rights reserved.
045 * 
046 * @author Steven Bethard
047 */
048public class VerbClauseTemporalTrain {
049
050  private static void error(String message) throws Exception {
051    Logger logger = UimaContextFactory.createUimaContext().getLogger();
052    logger.log(Level.SEVERE, String.format("%s\nusage: "
053        + "VerbClauseTemporalMain timebank-dir treebank-dir", message));
054    System.exit(1);
055  }
056
057  public static void main(String[] args) throws Exception {
058    // check arguments
059    if (args.length != 2) {
060      error("wrong number of arguments");
061    } else if (!new File(args[0]).exists()) {
062      error("TimeBank directory not found: " + args[0]);
063    } else if (!new File(args[1]).exists()) {
064      error("TreeBank directory not found: " + args[1]);
065    }
066    String timeBankDir = args[0];
067    String treeBankDir = args[1];
068
069    // clean up the mismatches between TimeBank and TreeBank
070    File cleanedTimeBankDir = getCleanedTimeBankDir(timeBankDir);
071    timeBankDir = cleanedTimeBankDir.getPath();
072
073    // run the components that write out the training data
074    SimplePipeline.runPipeline(
075        FilesCollectionReader.getCollectionReaderWithPatterns(
076            timeBankDir,
077            TimeMlGoldAnnotator.TIMEML_VIEW_NAME,
078            "wsj_.*[.]tml"),
079        TimeMlGoldAnnotator.getDescriptionNoTLINKs(),
080        PlainTextTlinkGoldAnnotator.getDescription(),
081        TreebankAligningAnnotator.getDescription(treeBankDir),
082        DefaultSnowballStemmer.getDescription("English"),
083        VerbClauseTemporalAnnotator.FACTORY.getWriterDescription());
084
085    // remove the temporary directory containing the cleaned up TimeBank
086    FileUtils.deleteRecursive(cleanedTimeBankDir);
087
088    // train the model
089    File trainingDirectory = VerbClauseTemporalAnnotator.FACTORY.getTrainingDirectory();
090    JarClassifierBuilder.trainAndPackage(trainingDirectory);
091
092    // delete the generated files
093    for (File file : trainingDirectory.listFiles()) {
094      File modelFile = JarClassifierBuilder.getModelJarFile(trainingDirectory);
095      if (!file.isDirectory() && !file.equals(modelFile)) {
096        file.delete();
097      }
098    }
099  }
100
101  public static File getCleanedTimeBankDir(String timeBankDir) throws IOException {
102    File tempDir = File.createTempFile("TimeBank", "Cleaned");
103    tempDir.delete();
104    tempDir.mkdir();
105    for (File file : new File(timeBankDir).listFiles()) {
106      String name = file.getName();
107      if (file.isHidden() || name.startsWith(".")) {
108        continue;
109      }
110
111      // get the file text
112      String text = FileUtils.file2String(file);
113
114      // all ampersands are messed up in TimeBank
115      text = text.replaceAll("\\bamp\\b", "&amp;");
116      text = text.replaceAll("SampP", "S&amp;P");
117      text = text.replaceAll("&&amp;;", "&amp;");
118
119      // all "---" missing in TreeBank
120      text = text.replaceAll("---", "");
121
122      // fix individual file errors
123      text = fixTextByFileName(name, text);
124
125      // write the file to the temp directory
126      FileUtils.saveString2File(text, new File(tempDir, file.getName()));
127    }
128    return tempDir;
129
130  }
131
132  public static String fixTextByFileName(String name, String text) {
133    // duplicate "the" in TimeBank
134    if (name.equals("wsj_0032.tml")) {
135      text = text.replace("the <TIMEX3 tid=\"t18\"", "<TIMEX3 tid=\"t18\"");
136    }
137
138    // missing "DD"s in TimeBank
139    else if (name.equals("wsj_0159.tml")) {
140      text = text.replace(
141          "Acquisition has <EVENT eid=\"e11\"",
142          "DD Acquisition has <EVENT eid=\"e11\"");
143      text = text.replace("Acquisition <EVENT eid=\"e20\"", "DD Acquisition <EVENT eid=\"e20\"");
144    }
145
146    // missing "BRUCE R. BENT" in TreeBank
147    else if (name.equals("wsj_0266.tml")) {
148      text = text.replace("BRUCE R. BENT", "");
149    }
150
151    // missing 30. in TreeBank
152    else if (name.equals("wsj_0344.tml")) {
153      text = text.replace(" 30</TIMEX3>.", "</TIMEX3>");
154    }
155
156    // reversed "off roughly" in TimeBank
157    else if (name.equals("wsj_0376.tml")) {
158      text = text.replace("roughly off", "off roughly");
159    }
160
161    // missing @... lines in TreeBank
162    else if (name.equals("wsj_0586.tml")) {
163      text = text.replaceAll("(?m)@((?!</HL>).)*?$", "");
164    }
165
166    // missing @CORPORATES and @EUROBONDS in TreeBank
167    else if (name.equals("wsj_0612.tml")) {
168      text = text.replace(
169          "@ <ENAMEX TYPE=\"ORGANIZATION\">CORPORATES",
170          "<ENAMEX TYPE=\"ORGANIZATION\">");
171      text = text.replace(
172          "@ <ENAMEX TYPE=\"ORGANIZATION\">EUROBONDS",
173          "<ENAMEX TYPE=\"ORGANIZATION\">");
174    }
175
176    // missing "1988." in TreeBank
177    else if (name.equals("wsj_0667.tml")) {
178      text = text.replace("1988</TIMEX3>.", "</TIMEX3>");
179    }
180
181    // missing "--" in TimeBank and missing "19.29." in TreeBank
182    else if (name.equals("wsj_0675.tml")) {
183      text = text.replace("Markets</ENAMEX>", "Markets</ENAMEX> --");
184      text = text.replace("19.29</CARDINAL>.", "</CARDINAL>");
185    }
186
187    // reversed "definitely not" in TimeBank
188    else if (name.equals("wsj_0781.tml")) {
189      text = text.replace("not definitely", "definitely not");
190    }
191
192    // really messed up text in TimeBank
193    else if (name.equals("wsj_1003.tml")) {
194      text = text.replace("a shhha55 cents a share,   ents a share, but  ssa share", "a share");
195      text = text.replace(
196          "steel business, <EVENT eid=\"e109\"",
197          "Armco, hampered by lower volume in its specialty steel "
198              + "business, <EVENT eid=\"e109\"");
199    }
200
201    return text;
202  }
203}