/*
 * Copyright 2015 DuraSpace, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.fcrepo.migration.foxml;

import org.apache.commons.codec.binary.Base64OutputStream;
import org.fcrepo.migration.ContentDigest;
import org.fcrepo.migration.DatastreamInfo;
import org.fcrepo.migration.DatastreamVersion;
import org.fcrepo.migration.DefaultContentDigest;
import org.fcrepo.migration.DefaultObjectInfo;
import org.fcrepo.migration.FedoraObjectProcessor;
import org.fcrepo.migration.ObjectInfo;
import org.fcrepo.migration.ObjectProperties;
import org.fcrepo.migration.ObjectReference;
import org.fcrepo.migration.StreamingFedoraObjectHandler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.xml.bind.JAXBContext;
import javax.xml.bind.JAXBElement;
import javax.xml.bind.JAXBException;
import javax.xml.bind.Unmarshaller;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLEventWriter;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import javax.xml.stream.events.XMLEvent;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * A FedoraObjectProcessor implementation that uses the StAX API to process
 * a FOXML XML InputStream.
 * @author mdurbin
 */
public class FoxmlInputStreamFedoraObjectProcessor implements FedoraObjectProcessor {

    private static final Logger LOG = LoggerFactory.getLogger(FoxmlInputStreamFedoraObjectProcessor.class);

    private static final String FOXML_NS = "info:fedora/fedora-system:def/foxml#";

    private final URLFetcher fetcher;

    private final String localFedoraServer;

    private final InternalIDResolver idResolver;

    private final InputStream stream;

    private final XMLStreamReader reader;

    /**
     * Temporary files created while decoding inline binary content; they are
     * removed by {@link #cleanUpTempFiles()}.
     */
    private final List<File> tempFiles;

    /**
     * Set to true by the constructor when the root element's VERSION
     * attribute is absent or is anything other than "1.1".
     */
    boolean isFedora2 = false;

    /**
     * The basic object information read from the XML stream at construction
     * time by processing the root XML element and its attributes.
     */
    private final ObjectInfo objectInfo;

    /**
     * foxml input stream fedora object processor.
     * <p>
     * The constructor reads the root element's attributes and then advances
     * the reader past any character data that follows it, so that the reader
     * is positioned on the first non-character event inside the root element.
     *
     * @param is the input stream
     * @param fetcher the fetcher
     * @param resolver the resolver
     * @param localFedoraServer the host and port (formatted like "localhost:8080") of the fedora 3 server
     *                          from which the content exposed by the "is" parameter comes.
     * @throws XMLStreamException xml stream exception
     */
    public FoxmlInputStreamFedoraObjectProcessor(final InputStream is, final URLFetcher fetcher,
            final InternalIDResolver resolver, final String localFedoraServer)
            throws XMLStreamException {
        this.fetcher = fetcher;
        this.idResolver = resolver;
        this.localFedoraServer = localFedoraServer;
        this.tempFiles = new ArrayList<File>();
        final XMLInputFactory factory = XMLInputFactory.newFactory();
        // Do not resolve external entities referenced by the parsed document (XXE hardening).
        factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
        stream = is;
        reader = factory.createXMLStreamReader(is);
        reader.nextTag();
        final Map<String, String> attributes = getAttributes(reader, "PID", "VERSION", "FEDORA_URI", "schemaLocation");
        if (!"1.1".equals(attributes.get("VERSION"))) {
            isFedora2 = true;
        }
        objectInfo = new DefaultObjectInfo(attributes.get("PID"), attributes.get("FEDORA_URI"));
        while (reader.next() == XMLStreamConstants.CHARACTERS) {
            // skip character data between the root element and its first child
        }
    }

    @Override
    public ObjectInfo getObjectInfo() {
        return objectInfo;
    }

    /**
     * Streams the remainder of the FOXML document to the given handler:
     * first the object properties, then every datastream version (and, for
     * Fedora 2 documents, every disseminator) in document order, and finally
     * the completion callback when the closing digitalObject tag is reached.
     * <p>
     * If anything fails, {@code handler.abortObject} is invoked and the
     * failure is rethrown as a RuntimeException (checked exceptions are
     * wrapped). Temporary files are removed and the underlying reader and
     * stream are closed in every case.
     *
     * @param handler the handler that receives the object's content
     */
    @Override
    public void processObject(final StreamingFedoraObjectHandler handler) {
        handler.beginObject(objectInfo);
        Foxml11DatastreamInfo dsInfo = null;
        try {
            handler.processObjectProperties(readProperties());
            while (reader.hasNext()) {
                if (reader.isCharacters()) {
                    if (!reader.isWhiteSpace()) {
                        throw new RuntimeException("Unexpected character data! \"" + reader.getText() + "\"");
                    }
                    // whitespace between elements is skipped
                } else if (reader.isStartElement()) {
                    final String localName = reader.getLocalName();
                    // Constant-first comparison: getNamespaceURI() is null for un-namespaced elements.
                    if (localName.equals("datastream") && FOXML_NS.equals(reader.getNamespaceURI())) {
                        dsInfo = new Foxml11DatastreamInfo(objectInfo, reader);
                    } else if (localName.equals("datastreamVersion")) {
                        final DatastreamVersion v = new Foxml11DatastreamVersion(dsInfo, reader);
                        handler.processDatastreamVersion(v);
                    } else if (localName.equals("disseminator") && isFedora2) {
                        readUntilClosed("disseminator", FOXML_NS);
                        handler.processDisseminator();
                    } else {
                        throw new RuntimeException("Unexpected element! \"" + reader.getLocalName() + "\"!");
                    }
                } else if (reader.isEndElement() && (dsInfo != null && reader.getLocalName().equals("datastream"))) {
                    dsInfo = null;
                } else if (reader.isEndElement() && reader.getLocalName().equals("digitalObject")) {
                    // end of document....
                    handler.completeObject(objectInfo);
                    cleanUpTempFiles();
                } else {
                    throw new RuntimeException("Unexpected xml structure! \"" + reader.getEventType() + "\" at line "
                            + reader.getLocation().getLineNumber() + ", column "
                            + reader.getLocation().getColumnNumber()
                            + "!" + (reader.isCharacters() ? " \"" + reader.getText() + "\"" : ""));
                }
                reader.next();
            }

        } catch (Exception e) {
            handler.abortObject(objectInfo);
            if (e instanceof RuntimeException) {
                throw (RuntimeException) e;
            }
            throw new RuntimeException(e);
        } finally {
            cleanUpTempFiles();
            close();
        }
    }

    /**
     * Close resources associated to the processor. Failures to close the
     * reader or the stream are logged and not propagated.
     */
    public void close() {
        try {
            reader.close();
        } catch (final XMLStreamException e) {
            LOG.warn("Failed to close reader cleanly", e);
        }
        try {
            stream.close();
        } catch (IOException e) {
            LOG.warn("Failed to close file cleanly", e);
        }
    }

    /**
     * Deletes the temporary files created while decoding binary content.
     * Files that are gone (or were never there) are forgotten; a file that
     * could not be deleted is logged and kept in the list so that a later
     * call can try again.
     */
    private void cleanUpTempFiles() {
        final Iterator<File> it = tempFiles.iterator();
        while (it.hasNext()) {
            final File f = it.next();
            if (!f.exists() || f.delete()) {
                it.remove();
            } else {
                LOG.warn("Failed to delete temporary file {}", f.getAbsolutePath());
            }
        }
    }

    /**
     * Unmarshals the object properties from the reader's current position
     * using JAXB.
     *
     * @return the object properties
     * @throws JAXBException if the properties cannot be unmarshalled
     * @throws XMLStreamException declared for callers; not thrown directly here
     */
    private ObjectProperties readProperties() throws JAXBException, XMLStreamException {
        final JAXBContext jc = JAXBContext.newInstance(FoxmlObjectProperties.class);
        final Unmarshaller unmarshaller = jc.createUnmarshaller();
        final JAXBElement<FoxmlObjectProperties> p = unmarshaller.unmarshal(reader, FoxmlObjectProperties.class);
        final FoxmlObjectProperties properties = p.getValue();
        if (isFedora2) {
            // Fedora 2 uses the rdf:type property with a literal value to differentiate between
            // objects, behavior mechanism objects and behavior definition objects.  That literal
            // cannot be retained as an rdf type in fedora4, nor can we use the generic mapping
            // to map it, so we convert it to a dcterms:type right here.
            for (FoxmlObjectProperty prop : properties.properties) {
                // Constant-first so a property without a name is skipped rather than causing an NPE.
                if ("http://www.w3.org/1999/02/22-rdf-syntax-ns#type".equals(prop.getName())) {
                    prop.name = "http://purl.org/dc/terms/type";
                }
            }
        }
        return properties;
    }

    /**
     * Advances the reader until it is positioned on the end tag of the named
     * element (or the reader is exhausted). If the reader is already on that
     * end tag, it is not advanced.
     *
     * @param name the local name of the element to close
     * @param namespace the namespace URI of the element to close
     * @throws XMLStreamException if the underlying reader fails
     */
    private void readUntilClosed(final String name, final String namespace) throws XMLStreamException {
        while (reader.hasNext()) {
            // namespace-first comparison: getNamespaceURI() may be null for un-namespaced elements
            if (reader.isEndElement() && reader.getLocalName().equals(name)
                    && namespace.equals(reader.getNamespaceURI())) {
                return;
            }
            // skip all other stuff....
            reader.next();
        }
    }

    /**
     * Datastream-level information captured from the attributes of a
     * datastream start element.
     */
    private class Foxml11DatastreamInfo implements DatastreamInfo {

        private final String id;

        private final String controlGroup;

        private final String fedoraUri;

        private final String state;

        private final boolean versionable;

        private final ObjectInfo objectInfo;

        /**
         * Reads the ID, CONTROL_GROUP, FEDORA_URI, STATE and VERSIONABLE
         * attributes from the element the reader is currently positioned on.
         * The reader is not advanced.
         *
         * @param objectInfo the object the datastream belongs to
         * @param reader the reader, positioned on the datastream start element
         */
        public Foxml11DatastreamInfo(final ObjectInfo objectInfo, final XMLStreamReader reader) {
            this.objectInfo = objectInfo;
            final Map<String, String> attributes
                    = getAttributes(reader, "ID", "CONTROL_GROUP", "FEDORA_URI", "STATE", "VERSIONABLE");
            id = attributes.get("ID");
            controlGroup = attributes.get("CONTROL_GROUP");
            fedoraUri = attributes.get("FEDORA_URI");
            state = attributes.get("STATE");
            // false when the attribute is absent
            versionable = Boolean.parseBoolean(attributes.get("VERSIONABLE"));
        }

        @Override
        public ObjectInfo getObjectInfo() {
            return objectInfo;
        }

        @Override
        public String getDatastreamId() {
            return id;
        }

        @Override
        public String getControlGroup() {
            return controlGroup;
        }

        @Override
        public String getFedoraURI() {
            return fedoraUri;
        }

        @Override
        public String getState() {
            return state;
        }

        @Override
        public boolean getVersionable() {
            return versionable;
        }
    }

    /**
     * A single datastream version read from a datastreamVersion element,
     * including a handle on its content.
     */
    public class Foxml11DatastreamVersion implements DatastreamVersion {

        private final DatastreamInfo dsInfo;

        private final String id;
        private final String label;
        private final String created;
        private final String mimeType;
        private final String altIds;
        private final String formatUri;
        private final long size;
        private ContentDigest contentDigest;
        private CachedContent dsContent;

        /**
         * foxml datastream version.
         * <p>
         * Reads the version's attributes from the current start element and
         * then consumes its children (contentDigest, xmlContent,
         * contentLocation, binaryContent) until the reader is positioned on
         * the closing datastreamVersion tag.
         *
         * @param dsInfo the datastream information
         * @param reader the reader
         * @throws XMLStreamException xml stream exception
         */
        public Foxml11DatastreamVersion(final DatastreamInfo dsInfo,
                final XMLStreamReader reader) throws XMLStreamException {
            this.dsInfo = dsInfo;
            final Map<String, String> dsAttributes = getAttributes(reader, "ID", "LABEL",
                    "CREATED", "MIMETYPE", "ALT_IDS", "FORMAT_URI", "SIZE");
            id = dsAttributes.get("ID");
            label = dsAttributes.get("LABEL");
            created = dsAttributes.get("CREATED");
            mimeType = dsAttributes.get("MIMETYPE");
            altIds = dsAttributes.get("ALT_IDS");
            formatUri = dsAttributes.get("FORMAT_URI");
            // -1 marks a version whose SIZE attribute is absent
            size = dsAttributes.containsKey("SIZE") ? Long.parseLong(dsAttributes.get("SIZE")) : -1;
            reader.next();

            while (reader.hasNext()) {
                if (reader.isCharacters()) {
                    if (!reader.isWhiteSpace()) {
                        throw new RuntimeException("Unexpected character data! \"" + reader.getText() + "\"");
                    }
                    // whitespace between elements is skipped
                } else if (reader.isStartElement()) {
                    final String localName = reader.getLocalName();
                    if (localName.equals("contentDigest")) {
                        final Map<String, String> attributes = getAttributes(reader, "TYPE", "DIGEST");
                        this.contentDigest = new DefaultContentDigest(attributes.get("TYPE"), attributes.get("DIGEST"));
                    } else if (localName.equals("xmlContent")) {
                        dsContent = readXmlContent(reader);
                    } else if (localName.equals("contentLocation")) {
                        dsContent = readContentLocation(reader);
                    } else if (localName.equals("binaryContent")) {
                        dsContent = readBinaryContent(reader);
                    } else {
                        throw new RuntimeException("Unexpected element! \"" + reader.getLocalName() + "\"!");
                    }
                } else if (reader.isEndElement()) {
                    if (reader.getLocalName().equals("datastreamVersion")) {
                        return;
                    }
                } else {
                    throw new RuntimeException("Unexpected xml structure! \"" + reader.getEventType() + "\" at line "
                            + reader.getLocation().getLineNumber() + ", column "
                            + reader.getLocation().getColumnNumber()
                            + "!" + (reader.isCharacters() ? " \"" + reader.getText() + "\"" : ""));
                }
                reader.next();
            }

        }

        /**
         * Copies the children of an xmlContent element into an in-memory
         * document. The events are re-serialized through an XMLEventWriter
         * because the XML fragment may not be valid out of context.
         * Copying stops at the closing xmlContent tag.
         */
        private CachedContent readXmlContent(final XMLStreamReader reader) throws XMLStreamException {
            reader.next();
            final ByteArrayOutputStream baos = new ByteArrayOutputStream();
            final XMLEventReader eventReader = XMLInputFactory.newFactory().createXMLEventReader(reader);
            final XMLEventWriter eventWriter = XMLOutputFactory.newFactory().createXMLEventWriter(baos);
            while (eventReader.hasNext()) {
                final XMLEvent event = eventReader.nextEvent();
                if (event.isEndElement()
                        && event.asEndElement().getName().getLocalPart().equals("xmlContent")
                        && event.asEndElement().getName().getNamespaceURI().equals(FOXML_NS)) {
                    eventWriter.close();
                    break;
                } else {
                    eventWriter.add(event);
                }
            }
            try {
                return new MemoryCachedContent(new String(baos.toByteArray(), "UTF-8"));
            } catch (final UnsupportedEncodingException e) {
                throw new RuntimeException(e);
            }
        }

        /**
         * Resolves a contentLocation element: INTERNAL_ID references go
         * through the id resolver, anything else is treated as a URL in which
         * the "local.fedora.server" placeholder is replaced by the configured
         * server.
         */
        private CachedContent readContentLocation(final XMLStreamReader reader) {
            final Map<String, String> attributes = getAttributes(reader, "REF", "TYPE");
            final String type = attributes.get("TYPE");
            if (type == null) {
                throw new RuntimeException("contentLocation element is missing its TYPE attribute!");
            }
            if (type.equals("INTERNAL_ID")) {
                return idResolver.resolveInternalID(attributes.get("REF"));
            }
            String ref = attributes.get("REF");
            if (ref == null) {
                throw new RuntimeException("contentLocation element is missing its REF attribute!");
            }
            try {
                if (ref.contains("local.fedora.server")) {
                    ref = ref.replace("local.fedora.server", localFedoraServer);
                }
                return new URLCachedContent(new URL(ref), fetcher);
            } catch (final MalformedURLException e) {
                throw new RuntimeException(e);
            }
        }

        /**
         * Decodes the base64 text of a binaryContent element into a temporary
         * file (registered for later clean-up) and leaves the reader on the
         * closing binaryContent tag.
         */
        private CachedContent readBinaryContent(final XMLStreamReader reader) throws XMLStreamException {
            final CachedContent content;
            try {
                final File f = File.createTempFile("decoded", "file");
                tempFiles.add(f);
                final Base64OutputStream out = new Base64OutputStream(new FileOutputStream(f), false);
                try {
                    while (reader.next() == XMLStreamConstants.CHARACTERS) {
                        out.write(reader.getText().getBytes("UTF-8"));
                    }
                    out.flush();
                } finally {
                    // always release the file handle, even when reading or decoding fails
                    out.close();
                }
                content = new FileCachedContent(f);
            } catch (final IOException e) {
                throw new RuntimeException(e);
            }
            readUntilClosed("binaryContent", FOXML_NS);
            return content;
        }

        @Override
        public DatastreamInfo getDatastreamInfo() {
            return dsInfo;
        }

        @Override
        public String getVersionId() {
            return id;
        }

        @Override
        public String getMimeType() {
            return mimeType;
        }

        @Override
        public String getLabel() {
            return label;
        }

        @Override
        public String getCreated() {
            return created;
        }

        @Override
        public String getAltIds() {
            return altIds;
        }

        @Override
        public String getFormatUri() {
            return formatUri;
        }

        @Override
        public long getSize() {
            return size;
        }

        @Override
        public ContentDigest getContentDigest() {
            return contentDigest;
        }

        @Override
        public InputStream getContent() throws IOException {
            return dsContent.getInputStream();
        }

        /**
         * @return the URL of the content when it is URL-backed
         * @throws IllegalStateException if the content is not URL-backed
         */
        @Override
        public String getExternalOrRedirectURL() {
            if (dsContent instanceof URLCachedContent) {
                return ((URLCachedContent) dsContent).getURL().toString();
            } else {
                throw new IllegalStateException();
            }
        }

        @Override
        public boolean isFirstVersionIn(final ObjectReference obj) {
            final List<DatastreamVersion> datastreams =
                    obj.getDatastreamVersions(getDatastreamInfo().getDatastreamId());
            return datastreams.indexOf(this) == 0;
        }

        @Override
        public boolean isLastVersionIn(final ObjectReference obj) {
            final List<DatastreamVersion> datastreams =
                    obj.getDatastreamVersions(getDatastreamInfo().getDatastreamId());
            return datastreams.indexOf(this) == datastreams.size() - 1;
        }
    }

    /**
     * Collects the attributes of the element the reader is positioned on,
     * keyed by local name. Only attributes whose local name is listed in
     * {@code allowedNames} are returned; any other attribute is reported
     * through the logger and dropped.
     *
     * @param r the reader, positioned on a start element
     * @param allowedNames the local names of the attributes to collect
     * @return a map from attribute local name to attribute value
     */
    private static Map<String, String> getAttributes(final XMLStreamReader r,
            final String ... allowedNames) {
        final Map<String, String> result = new HashMap<String, String>();
        final Set<String> allowed = new HashSet<String>(Arrays.asList(allowedNames));
        final int count = r.getAttributeCount();
        for (int i = 0; i < count; i++) {
            final String localName = r.getAttributeLocalName(i);
            final String value = r.getAttributeValue(i);
            if (allowed.contains(localName)) {
                result.put(localName, value);
            } else {
                LOG.warn("Unexpected attribute: {} = \"{}\"", localName, value);
            }
        }
        return result;

    }

}