/**
 * Copyright (c) 2007-2008, Regents of the University of Colorado
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
package org.cleartk.corpus.genia.pos.util;

import java.io.File;
import java.io.IOException;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Set;

import org.jdom2.Content;
import org.jdom2.Element;
import org.jdom2.JDOMException;
import org.jdom2.Text;
import org.jdom2.input.SAXBuilder;
import org.jdom2.output.XMLOutputter;

/**
 * <br>
 * Copyright (c) 2007-2008, Regents of the University of Colorado <br>
 * All rights reserved.
 *
 * <p>
 * This class parses the file GENIAcorpus3.02.pos.xml which provides sentence, word, and
 * part-of-speech data. This parser maintains the whitespace found in the xml file so that the text
 * added to the CAS does not come out as:
 * <p>
 * <code>"... of anti- Ro(SSA) antibodies . A pair of restriction "</code>
 * <p>
 * but instead comes out as:
 * <p>
 * <code>"... of anti-Ro(SSA) antibodies.  A pair of restriction "</code>
 * <p>
 * The genia corpus provides no white space between sentences. So, this parser simply appends two
 * spaces after each sentence. It also adds two newlines between the title and the body of the
 * abstract.
 * <p>
 * The parses returned by this parser will not have any named entities - i.e. there will be no
 * values returned from GeniaParse.getSemTags().
 *
 * <p>
 * About 4000 word (w) tags have a part-of-speech assignment "*" which I refer to as the wildcard
 * part-of-speech tag. An example is:
 *
 * <pre>
 * &lt;w c="*"&gt;Ras&lt;/w&gt;&lt;w c="NN"&gt;/protein&lt;/w&gt;
 * </pre>
 *
 * The above tags are parsed as a single token Ras/protein with the tag "NN".
 *
 * <p>
 * Instances created with {@link #GeniaPosParser()} have no articles to iterate over; they can only
 * be used to call {@link #parse(Element)} directly.
 *
 * @author Philip V. Ogren
 */

public class GeniaPosParser implements Iterator<GeniaParse> {

  /** Text appended after every sentence; the running offset advances by its length. */
  private static final String SENTENCE_SEPARATOR = "  ";

  /** Text inserted between the title and the abstract when both are present. */
  private static final String TITLE_SEPARATOR = "\n\n";

  /** Root element of the loaded document; stays null when no file was given. */
  Element root;

  /** Iterator over the root's "article" children; stays null when no file was given. */
  Iterator<?> articles;

  /** Initialised to an empty set; not otherwise used by this class. */
  Set<String> posLabels;

  /** Serialises each article element back to XML for {@code GeniaParse.setXml}. */
  XMLOutputter outputter;

  /**
   * Loads the given XML file and prepares to iterate over its "article" elements.
   *
   * @param xmlFile
   *          the corpus file to read
   * @throws IOException
   *           if the file cannot be read
   * @throws JDOMException
   *           if the file cannot be parsed as XML
   */
  public GeniaPosParser(File xmlFile) throws IOException, JDOMException {
    this();
    SAXBuilder builder = new SAXBuilder();
    builder.setDTDHandler(null);
    root = builder.build(xmlFile).getRootElement();
    articles = root.getChildren("article").iterator();
  }

  /**
   * Creates a parser that is not bound to a file. {@link #hasNext()} returns false on such an
   * instance, but {@link #parse(Element)} can still be called.
   */
  public GeniaPosParser() {
    posLabels = new HashSet<String>();
    outputter = new XMLOutputter();
  }

  /**
   * @return true if another article remains; false if the articles are exhausted or no file was
   *         loaded
   */
  @Override
  public boolean hasNext() {
    return articles != null && articles.hasNext();
  }

  /**
   * Parses and returns the next article.
   *
   * @return the parse of the next article element
   * @throws NoSuchElementException
   *           if no file was loaded or all articles have been consumed
   */
  @Override
  public GeniaParse next() {
    if (articles == null) {
      throw new NoSuchElementException("no GENIA file has been loaded");
    }
    return parse((Element) articles.next());
  }

  /**
   * Removal is not supported by this iterator.
   *
   * @throws UnsupportedOperationException
   *           always
   */
  @Override
  public void remove() {
    throw new UnsupportedOperationException("remove is not supported by GeniaPosParser");
  }

  /**
   * Builds a {@link GeniaParse} from one article element: the serialised XML, the medline
   * identifier, the reconstructed text, and the sentences and part-of-speech tags of the title and
   * the abstract.
   *
   * @param articleElement
   *          an "article" element containing "articleinfo"/"bibliomisc" and, optionally, "title"
   *          and "abstract" children
   * @return the populated parse
   * @throws RuntimeException
   *           if the medline identifier cannot be read, or a sentence contains an unexpected
   *           element
   */
  public GeniaParse parse(Element articleElement) {
    GeniaParse parse = new GeniaParse();
    parse.setXml(toXml(articleElement));
    parse.setMedline(extractMedlineId(articleElement));

    StringBuilder text = new StringBuilder();
    int offset = 0;

    Element titleElement = articleElement.getChild("title");
    Element abstractElement = articleElement.getChild("abstract");

    if (titleElement != null) {
      offset = parse(titleElement, parse, text, offset);
      if (abstractElement != null) {
        text.append(TITLE_SEPARATOR);
        offset += TITLE_SEPARATOR.length();
      }
    }

    if (abstractElement != null) {
      offset = parse(abstractElement, parse, text, offset);
    }

    parse.setText(text.toString());
    return parse;
  }

  /** Serialises the element to an XML string using the shared outputter. */
  private String toXml(Element articleElement) {
    // The field is package-visible, so tolerate it having been cleared from outside.
    XMLOutputter xmlOutputter = outputter != null ? outputter : new XMLOutputter();
    StringWriter stringWriter = new StringWriter();
    try {
      xmlOutputter.output(articleElement, stringWriter);
    } catch (IOException ioe) {
      throw new RuntimeException(ioe);
    }
    return stringWriter.toString();
  }

  /** Returns the part of the articleinfo/bibliomisc text that follows the first colon. */
  private static String extractMedlineId(Element articleElement) {
    Element articleInfo = articleElement.getChild("articleinfo");
    Element biblioMisc = articleInfo == null ? null : articleInfo.getChild("bibliomisc");
    if (biblioMisc == null) {
      throw new RuntimeException("article has no articleinfo/bibliomisc element: "
          + articleElement);
    }
    String[] parts = biblioMisc.getText().split(":");
    if (parts.length < 2) {
      throw new RuntimeException("no identifier found after ':' in bibliomisc text: "
          + biblioMisc.getText());
    }
    return parts[1];
  }

  /**
   * Appends the text of every "sentence" child of the given element to {@code text} and records
   * the sentences and their part-of-speech tags on {@code parse}.
   *
   * @param abstractElement
   *          the element (title or abstract) whose sentences are read
   * @param parse
   *          receives the sentences and tags
   * @param text
   *          receives the reconstructed text
   * @param offset
   *          the position in {@code text} at which this element starts
   * @return the offset after the last appended character
   */
  private int parse(Element abstractElement, GeniaParse parse, StringBuilder text, int offset) {
    List<GeniaTag> posTags = new ArrayList<GeniaTag>();

    for (Element sentence : abstractElement.getChildren("sentence")) {
      // A fresh list per sentence, so nothing added to one sentence can be altered later.
      List<GeniaTag> sentencePosTags = new ArrayList<GeniaTag>();
      // Start of a pending run of wildcard ("*") words, or -1 when there is none. A run that is
      // still pending at the end of the sentence produces no tag.
      int wildcardBegin = -1;
      int beginSentence = offset;

      for (Content content : sentence.getContent()) {
        if (content instanceof Text) {
          // Text between word elements is copied verbatim to preserve the file's whitespace.
          String between = ((Text) content).getText();
          text.append(between);
          offset += between.length();

        } else if (content instanceof Element) {
          Element wordElement = (Element) content;
          if (!wordElement.getName().equals("w")) {
            throw new RuntimeException("non-word element in sentence: " + wordElement);
          }
          String wordText = wordElement.getText();
          text.append(wordText);
          int wordEnd = offset + wordText.length();

          String pos = wordElement.getAttributeValue("c");
          if (pos == null) {
            throw new RuntimeException("word element has no \"c\" attribute: " + wordElement);
          }
          // Only the label before the first '|' is kept.
          int bar = pos.indexOf('|');
          if (bar != -1) {
            pos = pos.substring(0, bar);
          }

          if (pos.equals("*")) {
            if (wildcardBegin == -1) {
              wildcardBegin = offset;
            }
          } else {
            // A tagged word absorbs any wildcard words immediately preceding it.
            int tagBegin = wildcardBegin != -1 ? wildcardBegin : offset;
            GeniaTag posTag = new GeniaTag(pos, new Span(tagBegin, wordEnd));
            wildcardBegin = -1;
            posTags.add(posTag);
            sentencePosTags.add(posTag);
          }
          offset = wordEnd;
        }
      }

      GeniaSentence geniaSentence = new GeniaSentence();
      geniaSentence.setSpan(new Span(beginSentence, offset));
      geniaSentence.addPosTags(sentencePosTags);
      parse.addSentence(geniaSentence);

      // Keep the appended text and the offset in lock-step.
      text.append(SENTENCE_SEPARATOR);
      offset += SENTENCE_SEPARATOR.length();
    }
    parse.addPosTags(posTags);
    return offset;
  }

  /**
   * Loads the corpus file named by the first argument and prints the sorted set of distinct
   * part-of-speech labels found in it.
   *
   * @param args
   *          {@code args[0]} is the path to the corpus XML file
   */
  public static void main(String[] args) {
    if (args.length < 1) {
      System.err.println("usage: GeniaPosParser <GENIA pos xml file>");
      return;
    }
    try {
      System.out.print("loading GENIA...");
      GeniaPosParser parser = new GeniaPosParser(new File(args[0]));
      System.out.println("done.");

      Set<String> tags = new HashSet<String>();
      while (parser.hasNext()) {
        GeniaParse parse = parser.next();
        for (GeniaTag posTag : parse.getPosTags()) {
          tags.add(posTag.getLabel());
        }
      }

      List<String> sortedTags = new ArrayList<String>(tags);
      Collections.sort(sortedTags);
      System.out.println("number of tags=" + sortedTags.size());
      for (String tag : sortedTags) {
        System.out.println(tag);
      }
    } catch (Exception e) {
      e.printStackTrace();
    }
  }
}