/**
 * Copyright (c) 2007-2008, Regents of the University of Colorado
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
package org.cleartk.timeml.tlink;

import java.io.File;
import java.io.IOException;

import org.apache.uima.util.FileUtils;
import org.apache.uima.util.Level;
import org.apache.uima.util.Logger;
import org.cleartk.corpus.timeml.PlainTextTlinkGoldAnnotator;
import org.cleartk.corpus.timeml.TimeMlGoldAnnotator;
import org.cleartk.corpus.timeml.TreebankAligningAnnotator;
import org.cleartk.ml.jar.JarClassifierBuilder;
import org.cleartk.snowball.DefaultSnowballStemmer;
import org.cleartk.util.cr.FilesCollectionReader;
import org.apache.uima.fit.factory.UimaContextFactory;
import org.apache.uima.fit.pipeline.SimplePipeline;

/**
 * Command-line entry point that writes training data for
 * {@link VerbClauseTemporalAnnotator} from a TimeBank directory and a TreeBank
 * directory, then trains and packages the classifier.
 *
 * <br>
 * Copyright (c) 2007-2008, Regents of the University of Colorado <br>
 * All rights reserved.
 *
 * @author Steven Bethard
 */
public class VerbClauseTemporalTrain {

  /**
   * Logs the given message together with the command-line usage at SEVERE level
   * and terminates the JVM with exit status 1.
   *
   * @param message
   *          description of what was wrong with the invocation
   * @throws Exception
   *           if the UIMA context used to obtain the logger cannot be created
   */
  private static void error(String message) throws Exception {
    Logger logger = UimaContextFactory.createUimaContext().getLogger();
    logger.log(Level.SEVERE, String.format("%s\nusage: "
        + "VerbClauseTemporalTrain timebank-dir treebank-dir", message));
    System.exit(1);
  }

  /**
   * Runs the training workflow: validates the arguments, writes a cleaned copy
   * of TimeBank to a temporary directory, runs the pipeline that writes the
   * training data, trains and packages the model, and finally deletes every
   * regular file in the training directory except the model jar.
   *
   * @param args
   *          exactly two elements: the TimeBank directory and the TreeBank
   *          directory
   * @throws Exception
   *           if cleaning TimeBank, running the pipeline, or training fails
   */
  public static void main(String[] args) throws Exception {
    // check arguments (error() exits the JVM, so execution only continues when valid)
    if (args.length != 2) {
      error("wrong number of arguments");
    } else if (!new File(args[0]).exists()) {
      error("TimeBank directory not found: " + args[0]);
    } else if (!new File(args[1]).exists()) {
      error("TreeBank directory not found: " + args[1]);
    }
    String timeBankDir = args[0];
    String treeBankDir = args[1];

    // clean up the mismatches between TimeBank and TreeBank
    File cleanedTimeBankDir = getCleanedTimeBankDir(timeBankDir);
    timeBankDir = cleanedTimeBankDir.getPath();

    try {
      // run the components that write out the training data
      SimplePipeline.runPipeline(
          FilesCollectionReader.getCollectionReaderWithPatterns(
              timeBankDir,
              TimeMlGoldAnnotator.TIMEML_VIEW_NAME,
              "wsj_.*[.]tml"),
          TimeMlGoldAnnotator.getDescriptionNoTLINKs(),
          PlainTextTlinkGoldAnnotator.getDescription(),
          TreebankAligningAnnotator.getDescription(treeBankDir),
          DefaultSnowballStemmer.getDescription("English"),
          VerbClauseTemporalAnnotator.FACTORY.getWriterDescription());
    } finally {
      // remove the temporary directory containing the cleaned up TimeBank,
      // even when the pipeline fails, so that no temp data is left behind
      FileUtils.deleteRecursive(cleanedTimeBankDir);
    }

    // train the model
    File trainingDirectory = VerbClauseTemporalAnnotator.FACTORY.getTrainingDirectory();
    JarClassifierBuilder.trainAndPackage(trainingDirectory);

    // delete the generated files, keeping sub-directories and the model jar
    File modelFile = JarClassifierBuilder.getModelJarFile(trainingDirectory);
    File[] generatedFiles = trainingDirectory.listFiles();
    // listFiles() returns null when the path is not a directory or cannot be read
    if (generatedFiles != null) {
      for (File file : generatedFiles) {
        if (!file.isDirectory() && !file.equals(modelFile)) {
          // best-effort cleanup: a file that cannot be deleted is simply left in place
          file.delete();
        }
      }
    }
  }

  /**
   * Copies every non-hidden file of the given TimeBank directory into a newly
   * created temporary directory, applying text fixes to each file on the way:
   * ampersand repairs, removal of "---", and the per-file corrections of
   * {@link #fixTextByFileName(String, String)}.
   *
   * @param timeBankDir
   *          path of the original TimeBank directory
   * @return the temporary directory holding the cleaned files; the caller is
   *         responsible for deleting it
   * @throws IOException
   *           if the TimeBank directory cannot be listed, the temporary
   *           directory cannot be created, or a file cannot be read or written
   */
  public static File getCleanedTimeBankDir(String timeBankDir) throws IOException {
    // list the input first so that a bad path does not leave a temp directory behind
    File[] timeBankFiles = new File(timeBankDir).listFiles();
    if (timeBankFiles == null) {
      throw new IOException("Cannot list files of TimeBank directory: " + timeBankDir);
    }

    // createTempFile only reserves a unique name; swap the file for a directory
    File tempDir = File.createTempFile("TimeBank", "Cleaned");
    if (!tempDir.delete() || !tempDir.mkdir()) {
      throw new IOException("Cannot create temporary directory: " + tempDir);
    }

    boolean succeeded = false;
    try {
      for (File file : timeBankFiles) {
        String name = file.getName();
        if (file.isHidden() || name.startsWith(".")) {
          continue;
        }

        // get the file text
        String text = FileUtils.file2String(file);

        // all ampersands are messed up in TimeBank
        text = text.replaceAll("\\bamp\\b", "&");
        text = text.replace("SampP", "S&P");
        text = text.replace("&&;", "&");

        // all "---" missing in TreeBank
        text = text.replace("---", "");

        // fix individual file errors
        text = fixTextByFileName(name, text);

        // write the file to the temp directory
        FileUtils.saveString2File(text, new File(tempDir, name));
      }
      succeeded = true;
      return tempDir;
    } finally {
      // do not leave a half-populated temp directory behind on failure
      if (!succeeded) {
        FileUtils.deleteRecursive(tempDir);
      }
    }
  }

  /**
   * Applies the hand-written corrections for individual TimeBank files. The
   * file name selects at most one set of literal (or, for wsj_0586.tml, regular
   * expression) replacements; text of any other file is returned unchanged.
   *
   * @param name
   *          the file name, e.g. "wsj_0032.tml"
   * @param text
   *          the full text of that file
   * @return the corrected text
   */
  public static String fixTextByFileName(String name, String text) {
    // duplicate "the" in TimeBank
    if (name.equals("wsj_0032.tml")) {
      text = text.replace("the <TIMEX3 tid=\"t18\"", "<TIMEX3 tid=\"t18\"");
    }

    // missing "DD"s in TimeBank
    else if (name.equals("wsj_0159.tml")) {
      text = text.replace(
          "Acquisition has <EVENT eid=\"e11\"",
          "DD Acquisition has <EVENT eid=\"e11\"");
      text = text.replace("Acquisition <EVENT eid=\"e20\"", "DD Acquisition <EVENT eid=\"e20\"");
    }

    // missing "BRUCE R. BENT" in TreeBank
    else if (name.equals("wsj_0266.tml")) {
      text = text.replace("BRUCE R. BENT", "");
    }

    // missing 30. in TreeBank
    else if (name.equals("wsj_0344.tml")) {
      text = text.replace(" 30</TIMEX3>.", "</TIMEX3>");
    }

    // reversed "off roughly" in TimeBank
    else if (name.equals("wsj_0376.tml")) {
      text = text.replace("roughly off", "off roughly");
    }

    // missing @... lines in TreeBank
    else if (name.equals("wsj_0586.tml")) {
      text = text.replaceAll("(?m)@((?!</HL>).)*?$", "");
    }

    // missing @CORPORATES and @EUROBONDS in TreeBank
    else if (name.equals("wsj_0612.tml")) {
      text = text.replace(
          "@ <ENAMEX TYPE=\"ORGANIZATION\">CORPORATES",
          "<ENAMEX TYPE=\"ORGANIZATION\">");
      text = text.replace(
          "@ <ENAMEX TYPE=\"ORGANIZATION\">EUROBONDS",
          "<ENAMEX TYPE=\"ORGANIZATION\">");
    }

    // missing "1988." in TreeBank
    else if (name.equals("wsj_0667.tml")) {
      text = text.replace("1988</TIMEX3>.", "</TIMEX3>");
    }

    // missing "--" in TimeBank and missing "19.29." in TreeBank
    else if (name.equals("wsj_0675.tml")) {
      text = text.replace("Markets</ENAMEX>", "Markets</ENAMEX> --");
      text = text.replace("19.29</CARDINAL>.", "</CARDINAL>");
    }

    // reversed "definitely not" in TimeBank
    else if (name.equals("wsj_0781.tml")) {
      text = text.replace("not definitely", "definitely not");
    }

    // really messed up text in TimeBank
    else if (name.equals("wsj_1003.tml")) {
      text = text.replace("a shhha55 cents a share, ents a share, but ssa share", "a share");
      text = text.replace(
          "steel business, <EVENT eid=\"e109\"",
          "Armco, hampered by lower volume in its specialty steel "
              + "business, <EVENT eid=\"e109\"");
    }

    return text;
  }
}