001/** 
002 * Copyright (c) 2007-2008, Regents of the University of Colorado 
003 * All rights reserved.
004 * 
005 * Redistribution and use in source and binary forms, with or without
006 * modification, are permitted provided that the following conditions are met:
007 * 
008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
011 * 
012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
022 * POSSIBILITY OF SUCH DAMAGE. 
023 */
024package org.cleartk.corpus.genia.pos.util;
025
026import java.io.File;
027import java.io.IOException;
028import java.io.StringWriter;
029import java.util.ArrayList;
030import java.util.Collections;
031import java.util.HashSet;
032import java.util.Iterator;
033import java.util.List;
034import java.util.Set;
035
036import org.jdom2.Content;
037import org.jdom2.Element;
038import org.jdom2.JDOMException;
039import org.jdom2.Text;
040import org.jdom2.input.SAXBuilder;
041import org.jdom2.output.XMLOutputter;
042
043/**
044 * <br>
045 * Copyright (c) 2007-2008, Regents of the University of Colorado <br>
046 * All rights reserved.
047 * 
048 * <p>
049 * 
050 * 
051 * This class parses the file GENIAcorpus3.02.pos.xml which provides sentence, word, and
052 * part-of-speech data. This parser maintains the whitespace found in the xml file so that the text
053 * added to the CAS does not come out as:
054 * <p>
055 * <code>"... of anti- Ro(SSA) antibodies . A pair of restriction "</code>
056 * <p>
057 * but instead comes out as:
058 * <p>
059 * <code>"... of anti-Ro(SSA) antibodies.  A pair of restriction "</code>
060 * <p>
061 * 
062 * There is no white space provided between sentences provided by the genia corpus. So, this parser
063 * simply adds in two spaces between each sentence. It also adds two newlines between the title and
064 * the body of the abstract.
065 * <p>
066 * The parses returned by this parser will not have any named entities - i.e. there will be now
067 * values returned from GeniaParse.getSemTags().
068 * 
069 * <p>
070 * About 4000 word (w) tags have a part-of-speech assignment "*" which I refer to as the wildcard
071 * part-of-speech tag. An example is:
072 * 
073 * <pre>
074 *      &lt;w c=&quot;*&quot;&gt;Ras&lt;/w&gt;&lt;w c=&quot;NN&quot;&gt;/protein&lt;/w&gt;
075 * </pre>
076 * 
077 * The above tags are parsed as a single token Ras/protein with the tag "NN".
078 * 
079 * @author Philip V. Ogren
080 */
081
082public class GeniaPosParser implements Iterator<GeniaParse> {
083
084  Element root;
085
086  Iterator<?> articles;
087
088  Set<String> posLabels;
089
090  XMLOutputter outputter;
091
092  public GeniaPosParser(File xmlFile) throws IOException, JDOMException {
093    this();
094    SAXBuilder builder = new SAXBuilder();
095    builder.setDTDHandler(null);
096    root = builder.build(xmlFile).getRootElement();
097    articles = root.getChildren("article").iterator();
098    outputter = new XMLOutputter();
099  }
100
101  public GeniaPosParser() {
102    posLabels = new HashSet<String>();
103  }
104
105  public boolean hasNext() {
106    return articles.hasNext();
107  }
108
109  public GeniaParse next() {
110    return parse((Element) articles.next());
111  }
112
113  public void remove() {
114  }
115
116  public GeniaParse parse(Element articleElement) {
117    GeniaParse parse = new GeniaParse();
118
119    try {
120      StringWriter stringWriter = new StringWriter();
121      new XMLOutputter().output(articleElement, stringWriter);
122      parse.setXml(stringWriter.toString());
123    } catch (IOException ioe) {
124      throw new RuntimeException(ioe);
125    }
126
127    String medline = articleElement.getChild("articleinfo").getChild("bibliomisc").getText();
128    medline = medline.split(":")[1];
129    parse.setMedline(medline);
130
131    StringBuffer text = new StringBuffer();
132    int offset = 0;
133
134    Element titleElement = articleElement.getChild("title");
135    Element abstractElement = articleElement.getChild("abstract");
136
137    if (titleElement != null) {
138      offset = parse(titleElement, parse, text, offset);
139      if (abstractElement != null) {
140        text.append("\n\n");
141        offset += 2;
142      }
143    }
144
145    if (abstractElement != null) {
146      offset = parse(abstractElement, parse, text, offset);
147    }
148
149    parse.setText(text.toString());
150    return parse;
151  }
152
153  private int parse(Element abstractElement, GeniaParse parse, StringBuffer text, int offset) {
154    List<GeniaTag> posTags = new ArrayList<GeniaTag>();
155    List<GeniaTag> sentencePosTags = new ArrayList<GeniaTag>();
156
157    List<GeniaTag> wildcardTags = new ArrayList<GeniaTag>();
158    for (Element sentence : abstractElement.getChildren("sentence")) {
159      sentencePosTags.clear();
160      wildcardTags.clear();
161      int beginSentence = offset;
162      for (Content content : sentence.getContent()) {
163        if (content instanceof Text) {
164          Text contentText = (Text) content;
165          text.append(contentText.getText());
166          offset += contentText.getText().length();
167
168        } else if (content instanceof Element) {
169          Element wordElement = (Element) content;
170          if (!wordElement.getName().equals("w"))
171            throw new RuntimeException("non-word element in sentence: " + wordElement);
172          String wordText = wordElement.getText();
173          text.append(wordText);
174          String pos = wordElement.getAttributeValue("c");
175          if (pos.indexOf('|') != -1)
176            pos = pos.substring(0, pos.indexOf('|'));
177          GeniaTag posTag = new GeniaTag(pos, new Span(offset, offset + wordText.length()));
178          if (pos.equals("*")) {
179            wildcardTags.add(posTag);
180          } else {
181            if (wildcardTags.size() > 0) {
182              int start = wildcardTags.get(0).getSpans().get(0).getBegin();
183              posTag = new GeniaTag(pos, new Span(start, offset + wordText.length()));
184              wildcardTags.clear();
185            }
186            posTags.add(posTag);
187            sentencePosTags.add(posTag);
188          }
189          offset += wordText.length();
190        }
191      }
192
193      int endSentence = offset;
194      Span sentenceSpan = new Span(beginSentence, endSentence);
195      GeniaSentence geniaSentence = new GeniaSentence();
196      geniaSentence.setSpan(sentenceSpan);
197      geniaSentence.addPosTags(sentencePosTags);
198      parse.addSentence(geniaSentence);
199
200      text.append("  ");
201      offset += 2;
202    }
203    parse.addPosTags(posTags);
204    return offset;
205  }
206
207  public static void main(String[] args) {
208    try {
209      System.out.print("loading GENIA...");
210      String xmlFileName = args[0];
211      GeniaPosParser parser = new GeniaPosParser(new File(xmlFileName));
212      System.out.println("done.");
213      Set<String> tags = new HashSet<String>();
214      while (parser.hasNext()) {
215        GeniaParse parse = parser.next();
216        for (GeniaTag posTag : parse.getPosTags()) {
217          tags.add(posTag.getLabel());
218        }
219      }
220      List<String> sortedTags = new ArrayList<String>(tags);
221      Collections.sort(sortedTags);
222      System.out.println("number of tags=" + sortedTags.size());
223      for (String tag : sortedTags) {
224        System.out.println(tag);
225      }
226
227      // if (parser.hasNext()) {
228      // GeniaParse parse = parser.next();
229      // String text = parse.getText();
230      // System.out.println("\n\n\n\ntext = " + text);
231      // for (GeniaTag posTag : parse.getPosTags()) {
232      // System.out.println(posTag.getLabel());
233      // Span span = posTag.getSpans().get(0);
234      // System.out.println(text.substring(span.getBegin(),
235      // span.getEnd()));
236      // }
237      // for (GeniaSentence sentence : parse.getSentences()) {
238      // Span span = sentence.getSpan();
239      // System.out.println(text.substring(span.getBegin(),
240      // span.getEnd()));
241      // }
242      // }
243    } catch (Exception e) {
244      e.printStackTrace();
245    }
246
247  }
248}