001/* 002 * Copyright 2015 DuraSpace, Inc. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016package org.fcrepo.migration.foxml; 017 018import org.apache.commons.codec.binary.Base64OutputStream; 019import org.apache.commons.codec.binary.Hex; 020import org.apache.commons.codec.digest.DigestUtils; 021import org.apache.commons.io.FileUtils; 022import org.apache.commons.io.IOUtils; 023import org.apache.commons.lang3.StringUtils; 024import org.apache.xml.serialize.OutputFormat; 025import org.apache.xml.serialize.XMLSerializer; 026import org.codehaus.stax2.XMLInputFactory2; 027import org.fcrepo.migration.ContentDigest; 028import org.fcrepo.migration.DatastreamInfo; 029import org.fcrepo.migration.DatastreamVersion; 030import org.fcrepo.migration.DefaultContentDigest; 031import org.fcrepo.migration.DefaultObjectInfo; 032import org.fcrepo.migration.FedoraObjectProcessor; 033import org.fcrepo.migration.ObjectInfo; 034import org.fcrepo.migration.ObjectProperties; 035import org.fcrepo.migration.ObjectReference; 036import org.fcrepo.migration.StreamingFedoraObjectHandler; 037import org.slf4j.Logger; 038import org.slf4j.LoggerFactory; 039import org.w3c.dom.Document; 040import org.xml.sax.InputSource; 041import org.xml.sax.SAXException; 042 043import javax.xml.bind.JAXBContext; 044import javax.xml.bind.JAXBElement; 045import javax.xml.bind.JAXBException; 046import javax.xml.bind.Unmarshaller; 047import javax.xml.parsers.DocumentBuilder; 048import javax.xml.parsers.DocumentBuilderFactory; 049import javax.xml.parsers.ParserConfigurationException; 050import javax.xml.stream.XMLEventReader; 051import javax.xml.stream.XMLInputFactory; 052import javax.xml.stream.XMLStreamConstants; 053import javax.xml.stream.XMLStreamException; 054import javax.xml.stream.XMLStreamReader; 055import javax.xml.stream.events.XMLEvent; 056import java.io.BufferedInputStream; 057import java.io.BufferedReader; 058import java.io.ByteArrayOutputStream; 059import java.io.File; 060import java.io.FileInputStream; 061import java.io.FileNotFoundException; 062import java.io.FileOutputStream; 063import java.io.IOException; 064import java.io.InputStream; 065import java.io.InputStreamReader; 066import java.io.OutputStreamWriter; 067import java.io.PrintWriter; 068import java.io.StringReader; 069import java.io.StringWriter; 070import java.io.UncheckedIOException; 071import java.net.MalformedURLException; 072import java.net.URL; 073import java.nio.charset.StandardCharsets; 074import java.util.ArrayList; 075import java.util.Arrays; 076import java.util.HashMap; 077import java.util.HashSet; 078import java.util.LinkedList; 079import java.util.List; 080import java.util.Map; 081import java.util.Set; 082import java.util.regex.Pattern; 083 084/** 085 * A FedoraObjectProcessor implementation that uses the STaX API to process 086 * a FOXML XML InputStream. 087 * @author mdurbin 088 */ 089public class FoxmlInputStreamFedoraObjectProcessor implements FedoraObjectProcessor { 090 091 private static final Logger LOG = LoggerFactory.getLogger(FoxmlInputStreamFedoraObjectProcessor.class); 092 093 private static final Pattern INLINE_PATTERN = Pattern.compile("<foxml:xmlContent>(.*?)</foxml:xmlContent>", 094 Pattern.DOTALL); 095 096 private static final String FOXML_NS = "info:fedora/fedora-system:def/foxml#"; 097 098 private URLFetcher fetcher; 099 100 private String localFedoraServer; 101 102 private InternalIDResolver idResolver; 103 104 private File file; 105 106 private InputStream stream; 107 108 private XMLStreamReader reader; 109 110 private DocumentBuilder documentBuilder; 111 112 private List<File> tempFiles; 113 114 private LinkedList<String> inlineXml; 115 116 /** 117 * The basic object information read from the XML stream at construction 118 * time by processing the root XML element and its attributes. 119 */ 120 private ObjectInfo objectInfo; 121 122 /** 123 * foxml input stream fedora object processor. 124 * @param file the FOXML file 125 * @param fetcher the fetcher 126 * @param resolver the resolver 127 * @param localFedoraServer the host and port (formatted like "localhost:8080") of the fedora 3 server 128 * from which the content exposed by the "is" parameter comes. 129 * @throws XMLStreamException xml stream exception 130 */ 131 public FoxmlInputStreamFedoraObjectProcessor(final File file, final URLFetcher fetcher, 132 final InternalIDResolver resolver, final String localFedoraServer) 133 throws XMLStreamException, FileNotFoundException { 134 this.file = file; 135 this.fetcher = fetcher; 136 this.idResolver = resolver; 137 this.localFedoraServer = localFedoraServer; 138 final XMLInputFactory factory = XMLInputFactory.newFactory(); 139 stream = new BufferedInputStream(new FileInputStream(file)); 140 reader = factory.createXMLStreamReader(stream); 141 reader.nextTag(); 142 final Map<String, String> attributes = getAttributes(reader, "PID", "VERSION", "FEDORA_URI", "schemaLocation"); 143 objectInfo = new DefaultObjectInfo(attributes.get("PID"), attributes.get("FEDORA_URI"), this.file.toPath()); 144 while (reader.next() == XMLStreamConstants.CHARACTERS) { 145 } 146 147 tempFiles = new ArrayList<File>(); 148 149 final var builderFactory = DocumentBuilderFactory.newInstance(); 150 builderFactory.setNamespaceAware(true); 151 builderFactory.setIgnoringComments(false); 152 try { 153 documentBuilder = builderFactory.newDocumentBuilder(); 154 } catch (ParserConfigurationException e) { 155 throw new RuntimeException(e); 156 } 157 158 try { 159 inlineXml = new LinkedList<>(); 160 final var content = FileUtils.readFileToString(file); 161 final var matcher = INLINE_PATTERN.matcher(content); 162 while (matcher.find()) { 163 inlineXml.add(matcher.group(1)); 164 } 165 } catch (IOException e) { 166 throw new UncheckedIOException(e); 167 } 168 } 169 170 @Override 171 public ObjectInfo getObjectInfo() { 172 return objectInfo; 173 } 174 175 @Override 176 public void processObject(final StreamingFedoraObjectHandler handler) { 177 handler.beginObject(objectInfo); 178 Foxml11DatastreamInfo dsInfo = null; 179 try { 180 handler.processObjectProperties(readProperties()); 181 while (reader.hasNext()) { 182 if (reader.isCharacters()) { 183 if (!reader.isWhiteSpace()) { 184 throw new RuntimeException("Unexpected character data! \"" + reader.getText() + "\""); 185 } else { 186 // skip whitespace... 187 } 188 } else if (reader.isStartElement()) { 189 if (reader.getLocalName().equals("datastream") 190 && reader.getNamespaceURI().equals(FOXML_NS)) { 191 dsInfo = new Foxml11DatastreamInfo(objectInfo, reader); 192 } else if (reader.getLocalName().equals("datastreamVersion")) { 193 final var v = new Foxml11DatastreamVersion(dsInfo, reader); 194 v.validateInlineXml(); 195 handler.processDatastreamVersion(v); 196 } else { 197 throw new RuntimeException("Unexpected element! \"" + reader.getLocalName() + "\"!"); 198 } 199 } else if (reader.isEndElement() && (dsInfo != null && reader.getLocalName().equals("datastream"))) { 200 dsInfo = null; 201 } else if (reader.isEndElement() && reader.getLocalName().equals("digitalObject")) { 202 // end of document.... 203 handler.completeObject(objectInfo); 204 cleanUpTempFiles(); 205 } else { 206 throw new RuntimeException("Unexpected xml structure! \"" + reader.getEventType() + "\" at line " 207 + reader.getLocation().getLineNumber() + ", column " 208 + reader.getLocation().getColumnNumber() 209 + "!" + (reader.isCharacters() ? " \"" + reader.getText() + "\"" : "")); 210 } 211 reader.next(); 212 } 213 214 } catch (Exception e) { 215 handler.abortObject(objectInfo); 216 if (e instanceof RuntimeException) { 217 throw (RuntimeException) e; 218 } 219 throw new RuntimeException(e); 220 } finally { 221 cleanUpTempFiles(); 222 close(); 223 } 224 } 225 226 /** 227 * Close resources associated to the processor 228 */ 229 public void close() { 230 try { 231 reader.close(); 232 } catch (final XMLStreamException e) { 233 LOG.warn("Failed to close reader cleanly", e); 234 } 235 try { 236 stream.close(); 237 } catch (IOException e) { 238 LOG.warn("Failed to close file cleanly", e); 239 } 240 } 241 242 private void cleanUpTempFiles() { 243 for (final File f : this.tempFiles) { 244 if (f.exists()) { 245 f.delete(); 246 } 247 } 248 } 249 250 private ObjectProperties readProperties() throws JAXBException, XMLStreamException { 251 final JAXBContext jc = JAXBContext.newInstance(FoxmlObjectProperties.class); 252 final Unmarshaller unmarshaller = jc.createUnmarshaller(); 253 final JAXBElement<FoxmlObjectProperties> p = unmarshaller.unmarshal(reader, FoxmlObjectProperties.class); 254 final FoxmlObjectProperties properties = p.getValue(); 255 return properties; 256 } 257 258 private void readUntilClosed(final String name, final String namespace) throws XMLStreamException { 259 while (reader.hasNext()) { 260 if (reader.isEndElement() && reader.getLocalName().equals(name) 261 && reader.getNamespaceURI().equals(namespace)) { 262 return; 263 } else { 264 // skip all other stuff.... 265 } 266 reader.next(); 267 } 268 } 269 270 private class Foxml11DatastreamInfo implements DatastreamInfo { 271 272 private String id; 273 274 private String controlGroup; 275 276 private String fedoraUri; 277 278 private String state; 279 280 private boolean versionable; 281 282 private ObjectInfo objectInfo; 283 284 public Foxml11DatastreamInfo(final ObjectInfo objectInfo, final XMLStreamReader reader) { 285 this.objectInfo = objectInfo; 286 final Map<String, String> attributes 287 = getAttributes(reader, "ID", "CONTROL_GROUP", "FEDORA_URI", "STATE", "VERSIONABLE"); 288 id = attributes.get("ID"); 289 controlGroup = attributes.get("CONTROL_GROUP"); 290 fedoraUri = attributes.get("FEDORA_URI"); 291 state = attributes.get("STATE"); 292 versionable = Boolean.valueOf(attributes.get("VERSIONABLE")); 293 } 294 295 @Override 296 public ObjectInfo getObjectInfo() { 297 return objectInfo; 298 } 299 300 @Override 301 public String getDatastreamId() { 302 return id; 303 } 304 305 @Override 306 public String getControlGroup() { 307 return controlGroup; 308 } 309 310 @Override 311 public String getFedoraURI() { 312 return fedoraUri; 313 } 314 315 @Override 316 public String getState() { 317 return state; 318 } 319 320 @Override 321 public boolean getVersionable() { 322 return versionable; 323 } 324 } 325 326 public class Foxml11DatastreamVersion implements DatastreamVersion { 327 328 private DatastreamInfo dsInfo; 329 330 private String id; 331 private String label; 332 private String created; 333 private String mimeType; 334 private String altIds; 335 private String formatUri; 336 private long size; 337 private ContentDigest contentDigest; 338 private CachedContent dsContent; 339 private boolean isInlineXml = false; 340 341 /** 342 * foxml datastream version. 343 * @param dsInfo the datastream information 344 * @param reader the reader 345 * @throws XMLStreamException xml stream exception 346 */ 347 public Foxml11DatastreamVersion(final DatastreamInfo dsInfo, 348 final XMLStreamReader reader) throws XMLStreamException { 349 this.dsInfo = dsInfo; 350 final Map<String, String> dsAttributes = getAttributes(reader, "ID", "LABEL", 351 "CREATED", "MIMETYPE", "ALT_IDS", "FORMAT_URI", "SIZE"); 352 id = dsAttributes.get("ID"); 353 label = dsAttributes.get("LABEL"); 354 created = dsAttributes.get("CREATED"); 355 mimeType = dsAttributes.get("MIMETYPE"); 356 altIds = dsAttributes.get("ALT_IDS"); 357 formatUri = dsAttributes.get("FORMAT_URI"); 358 size = dsAttributes.containsKey("SIZE") ? Long.parseLong(dsAttributes.get("SIZE")) : -1; 359 reader.next(); 360 361 while (reader.hasNext()) { 362 if (reader.isCharacters()) { 363 if (!reader.isWhiteSpace()) { 364 throw new RuntimeException("Unexpected character data! \"" + reader.getText() + "\""); 365 } else { 366 // skip whitespace... 367 } 368 } else if (reader.isStartElement()) { 369 final String localName = reader.getLocalName(); 370 if (localName.equals("contentDigest")) { 371 final Map<String, String> attributes = getAttributes(reader, "TYPE", "DIGEST"); 372 this.contentDigest = new DefaultContentDigest(attributes.get("TYPE"), attributes.get("DIGEST")); 373 } else if (localName.equals("xmlContent")) { 374 // this XML fragment may not be valid out of context 375 // context, so write it out as a complete XML 376 // file... 377 reader.next(); 378 379 isInlineXml = true; 380 dsContent = new MemoryCachedContent(extractInlineXml()); 381 } else if (localName.equals("contentLocation")) { 382 final Map<String, String> attributes = getAttributes(reader, "REF", "TYPE"); 383 if (attributes.get("TYPE").equals("INTERNAL_ID")) { 384 dsContent = idResolver.resolveInternalID(attributes.get("REF")); 385 } else { 386 try { 387 String ref = attributes.get("REF"); 388 if (ref.contains("local.fedora.server")) { 389 ref = ref.replace("local.fedora.server", localFedoraServer); 390 } 391 dsContent = new URLCachedContent(new URL(ref), fetcher); 392 } catch (final MalformedURLException e) { 393 throw new RuntimeException(e); 394 } 395 } 396 } else if (localName.equals("binaryContent")) { 397 try { 398 final File f = File.createTempFile("decoded", "file"); 399 tempFiles.add(f); 400 final Base64OutputStream out = new Base64OutputStream(new FileOutputStream(f), false); 401 while (reader.next() == XMLStreamConstants.CHARACTERS) { 402 out.write(reader.getText().getBytes("UTF-8")); 403 } 404 out.flush(); 405 out.close(); 406 dsContent = new FileCachedContent(f); 407 } catch (final IOException e) { 408 throw new RuntimeException(e); 409 } 410 readUntilClosed("binaryContent", FOXML_NS); 411 } else { 412 throw new RuntimeException("Unexpected element! \"" + reader.getLocalName() + "\"!"); 413 } 414 } else if (reader.isEndElement()) { 415 if (reader.getLocalName().equals("datastreamVersion")) { 416 return; 417 } 418 } else { 419 throw new RuntimeException("Unexpected xml structure! \"" + reader.getEventType() + "\" at line " 420 + reader.getLocation().getLineNumber() + ", column " 421 + reader.getLocation().getColumnNumber() 422 + "!" + (reader.isCharacters() ? " \"" + reader.getText() + "\"" : "")); 423 } 424 reader.next(); 425 } 426 427 } 428 429 private String extractInlineXml() throws XMLStreamException { 430 final XMLEventReader eventReader = XMLInputFactory2.newFactory().createXMLEventReader(reader); 431 while (eventReader.hasNext()) { 432 final XMLEvent event = eventReader.nextEvent(); 433 if (event.isEndElement() 434 && event.asEndElement().getName().getLocalPart().equals("xmlContent") 435 && event.asEndElement().getName().getNamespaceURI().equals(FOXML_NS)) { 436 break; 437 } 438 } 439 440 return inlineXml.removeFirst(); 441 } 442 443 private void validateInlineXml() { 444 if (isInlineXml && contentDigest != null && StringUtils.isNotBlank(contentDigest.getDigest())) { 445 final var transformedXml = transformInlineXmlForChecksum(); 446 final var digest = DigestUtils.getDigest(contentDigest.getType()); 447 final var digestBytes = DigestUtils.digest(digest, transformedXml); 448 final var digestHex = Hex.encodeHexString(digestBytes); 449 450 if (!digestHex.equalsIgnoreCase(contentDigest.getDigest())) { 451 throw new RuntimeException(String.format( 452 "Inline XML %s %s failed checksum validation. Expected %s: %s; Actual: %s", 453 dsInfo.getObjectInfo().getPid(), dsInfo.getDatastreamId(), 454 contentDigest.getType(), contentDigest.getDigest(), digestHex)); 455 } 456 } 457 } 458 459 /** 460 * This code is based on: https://github.com/fcrepo3/fcrepo-historical/blob/ 461 * e8a3be191cce6bbf8f55cd02bf1d52ac53425146/fcrepo-server/src/main/java/fedora/server/storage/types/ 462 * DatastreamXMLMetadata.java#L92 463 * 464 * This code MUST use these deprecated classes in order to generate the XML attributes in the expected order. 465 * 466 * @return the xml in the format Fedora 3 used to calculate digests 467 */ 468 private byte[] transformInlineXmlForChecksum() { 469 try { 470 // This MUST be done or else Windows will refuse to use the correct encoding!!! :( :( :( 471 final var xml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" 472 + IOUtils.toString(dsContent.getInputStream(), StandardCharsets.UTF_8); 473 474 final var isReader = new InputStreamReader(IOUtils.toInputStream(xml), StandardCharsets.UTF_8); 475 final var source = new InputSource(isReader); 476 source.setEncoding("UTF-8"); 477 478 final Document doc = documentBuilder.parse(source); 479 480 final OutputFormat fmt = new OutputFormat("XML", "UTF-8", false); 481 // indent == 0 means add no indenting 482 fmt.setIndent(0); 483 // default line width is 72, but only applies when indenting 484 fmt.setLineWidth(0); 485 fmt.setPreserveSpace(false); 486 487 final StringWriter out = new StringWriter(); 488 final XMLSerializer ser = new XMLSerializer(out, fmt); 489 ser.serialize(doc); 490 out.close(); 491 492 final var baos = new ByteArrayOutputStream(); 493 final var br = new BufferedReader(new StringReader(out.toString())); 494 String line; 495 final PrintWriter outStream = new PrintWriter(new OutputStreamWriter(baos, StandardCharsets.UTF_8)); 496 while ((line = br.readLine()) != null) { 497 line = line.trim(); 498 outStream.append(line); 499 } 500 outStream.close(); 501 502 return baos.toByteArray(); 503 } catch (IOException e) { 504 throw new UncheckedIOException(e); 505 } catch (SAXException e) { 506 try { 507 LOG.error("Malformed inline XML: {}", IOUtils.toString(dsContent.getInputStream())); 508 } catch (IOException e2) { 509 // swallow 510 } 511 throw new RuntimeException(e); 512 } 513 } 514 515 @Override 516 public DatastreamInfo getDatastreamInfo() { 517 return dsInfo; 518 } 519 520 @Override 521 public String getVersionId() { 522 return id; 523 } 524 525 @Override 526 public String getMimeType() { 527 return mimeType; 528 } 529 530 @Override 531 public String getLabel() { 532 return label; 533 } 534 535 @Override 536 public String getCreated() { 537 return created; 538 } 539 540 @Override 541 public String getAltIds() { 542 return altIds; 543 } 544 545 @Override 546 public String getFormatUri() { 547 return formatUri; 548 } 549 550 @Override 551 public long getSize() { 552 return size; 553 } 554 555 @Override 556 public ContentDigest getContentDigest() { 557 // The digests for inline xml do not match what is stored in the FOXML and should not be returned here. 558 if (isInlineXml) { 559 return null; 560 } 561 return contentDigest; 562 } 563 564 @Override 565 public InputStream getContent() throws IOException { 566 return dsContent.getInputStream(); 567 } 568 569 @Override 570 public String getExternalOrRedirectURL() { 571 if (dsContent instanceof URLCachedContent) { 572 return ((URLCachedContent) dsContent).getURL().toString(); 573 } else { 574 throw new IllegalStateException(); 575 } 576 } 577 578 @Override 579 public boolean isFirstVersionIn(final ObjectReference obj) { 580 final List<DatastreamVersion> datastreams = 581 obj.getDatastreamVersions(getDatastreamInfo().getDatastreamId()); 582 return datastreams.indexOf(this) == 0; 583 } 584 585 @Override 586 public boolean isLastVersionIn(final ObjectReference obj) { 587 final List<DatastreamVersion> datastreams = 588 obj.getDatastreamVersions(getDatastreamInfo().getDatastreamId()); 589 return datastreams.indexOf(this) == datastreams.size() - 1; 590 } 591 } 592 593 private static Map<String, String> getAttributes(final XMLStreamReader r, 594 final String ... allowedNames) { 595 final HashMap<String, String> result = new HashMap<String, String>(); 596 final Set<String> allowed = new HashSet<String>(Arrays.asList(allowedNames)); 597 for (int i = 0; i < r.getAttributeCount(); i ++) { 598 final String localName = r.getAttributeLocalName(i); 599 final String value = r.getAttributeValue(i); 600 if (allowed.contains(localName)) { 601 result.put(localName, value); 602 } else { 603 System.err.println("Unexpected attribute: " + localName + " = \"" + value + "\""); 604 } 605 } 606 return result; 607 608 } 609 610}