001/** 002 * The contents of this file are subject to the license and copyright 003 * detailed in the LICENSE and NOTICE files at the root of the source 004 * tree. 005 * 006 */ 007package org.fcrepo.migration.foxml; 008 009import io.micrometer.core.instrument.Metrics; 010import io.micrometer.core.instrument.Timer; 011import org.apache.commons.codec.binary.Base64OutputStream; 012import org.apache.commons.codec.binary.Hex; 013import org.apache.commons.codec.digest.DigestUtils; 014import org.apache.commons.io.FileUtils; 015import org.apache.commons.io.IOUtils; 016import org.apache.commons.lang3.StringUtils; 017import org.apache.xml.serialize.OutputFormat; 018import org.apache.xml.serialize.XMLSerializer; 019import org.codehaus.stax2.XMLInputFactory2; 020import org.fcrepo.migration.ContentDigest; 021import org.fcrepo.migration.DatastreamInfo; 022import org.fcrepo.migration.DatastreamVersion; 023import org.fcrepo.migration.DefaultContentDigest; 024import org.fcrepo.migration.DefaultObjectInfo; 025import org.fcrepo.migration.FedoraObjectProcessor; 026import org.fcrepo.migration.ObjectInfo; 027import org.fcrepo.migration.ObjectProperties; 028import org.fcrepo.migration.ObjectReference; 029import org.fcrepo.migration.StreamingFedoraObjectHandler; 030import org.slf4j.Logger; 031import org.slf4j.LoggerFactory; 032import org.w3c.dom.Document; 033import org.xml.sax.InputSource; 034import org.xml.sax.SAXException; 035 036import javax.xml.bind.JAXBContext; 037import javax.xml.bind.JAXBElement; 038import javax.xml.bind.JAXBException; 039import javax.xml.bind.Unmarshaller; 040import javax.xml.parsers.DocumentBuilder; 041import javax.xml.parsers.DocumentBuilderFactory; 042import javax.xml.parsers.ParserConfigurationException; 043import javax.xml.stream.XMLEventReader; 044import javax.xml.stream.XMLInputFactory; 045import javax.xml.stream.XMLStreamConstants; 046import javax.xml.stream.XMLStreamException; 047import javax.xml.stream.XMLStreamReader; 048import javax.xml.stream.events.XMLEvent; 049import java.io.BufferedInputStream; 050import java.io.BufferedOutputStream; 051import java.io.BufferedReader; 052import java.io.ByteArrayOutputStream; 053import java.io.File; 054import java.io.FileInputStream; 055import java.io.FileNotFoundException; 056import java.io.FileOutputStream; 057import java.io.IOException; 058import java.io.InputStream; 059import java.io.InputStreamReader; 060import java.io.OutputStreamWriter; 061import java.io.PrintWriter; 062import java.io.StringReader; 063import java.io.StringWriter; 064import java.io.UncheckedIOException; 065import java.net.MalformedURLException; 066import java.net.URL; 067import java.nio.charset.StandardCharsets; 068import java.time.Instant; 069import java.util.ArrayList; 070import java.util.Arrays; 071import java.util.HashMap; 072import java.util.HashSet; 073import java.util.LinkedList; 074import java.util.List; 075import java.util.Map; 076import java.util.Optional; 077import java.util.Set; 078import java.util.regex.Pattern; 079 080/** 081 * A FedoraObjectProcessor implementation that uses the STaX API to process 082 * a FOXML XML InputStream. 083 * @author mdurbin 084 */ 085public class FoxmlInputStreamFedoraObjectProcessor implements FedoraObjectProcessor { 086 087 private static final Logger LOG = LoggerFactory.getLogger(FoxmlInputStreamFedoraObjectProcessor.class); 088 089 private static final Pattern INLINE_PATTERN = Pattern.compile("<foxml:xmlContent>(.*?)</foxml:xmlContent>", 090 Pattern.DOTALL); 091 092 private static final String FOXML_NS = "info:fedora/fedora-system:def/foxml#"; 093 094 private static final String METRIC_NAME = "fcrepo.storage.foxml.object"; 095 private static final String OPERATION = "operation"; 096 private static final Timer processObjectTimer = Metrics.timer(METRIC_NAME, OPERATION, "processObject"); 097 private static final Timer completeObjectTimer = Metrics.timer(METRIC_NAME, OPERATION, "completeObject"); 098 099 private URLFetcher fetcher; 100 101 private String localFedoraServer; 102 103 private InternalIDResolver idResolver; 104 105 private File file; 106 107 private InputStream stream; 108 109 private XMLStreamReader reader; 110 111 private DocumentBuilder documentBuilder; 112 113 private List<File> tempFiles; 114 115 private LinkedList<String> inlineXml; 116 117 /** 118 * The basic object information read from the XML stream at construction 119 * time by processing the root XML element and its attributes. 120 */ 121 private ObjectInfo objectInfo; 122 123 /** 124 * foxml input stream fedora object processor. 125 * @param file the FOXML file 126 * @param fetcher the fetcher 127 * @param resolver the resolver 128 * @param localFedoraServer the host and port (formatted like "localhost:8080") of the fedora 3 server 129 * from which the content exposed by the "is" parameter comes. 130 * @throws XMLStreamException xml stream exception 131 */ 132 public FoxmlInputStreamFedoraObjectProcessor(final File file, final URLFetcher fetcher, 133 final InternalIDResolver resolver, final String localFedoraServer) 134 throws XMLStreamException, FileNotFoundException { 135 this.file = file; 136 this.fetcher = fetcher; 137 this.idResolver = resolver; 138 this.localFedoraServer = localFedoraServer; 139 final XMLInputFactory factory = XMLInputFactory.newFactory(); 140 stream = new BufferedInputStream(new FileInputStream(file)); 141 reader = factory.createXMLStreamReader(stream); 142 reader.nextTag(); 143 final Map<String, String> attributes = getAttributes(reader, "PID", "VERSION", "FEDORA_URI", "schemaLocation"); 144 objectInfo = new DefaultObjectInfo(attributes.get("PID"), attributes.get("FEDORA_URI"), this.file.toPath()); 145 while (reader.next() == XMLStreamConstants.CHARACTERS) { 146 } 147 148 tempFiles = new ArrayList<File>(); 149 150 final var builderFactory = DocumentBuilderFactory.newInstance(); 151 builderFactory.setNamespaceAware(true); 152 builderFactory.setIgnoringComments(false); 153 try { 154 documentBuilder = builderFactory.newDocumentBuilder(); 155 } catch (ParserConfigurationException e) { 156 throw new RuntimeException(e); 157 } 158 159 try { 160 inlineXml = new LinkedList<>(); 161 final var content = FileUtils.readFileToString(file); 162 final var matcher = INLINE_PATTERN.matcher(content); 163 while (matcher.find()) { 164 inlineXml.add(matcher.group(1)); 165 } 166 } catch (IOException e) { 167 throw new UncheckedIOException(e); 168 } 169 } 170 171 @Override 172 public ObjectInfo getObjectInfo() { 173 return objectInfo; 174 } 175 176 @Override 177 public void processObject(final StreamingFedoraObjectHandler handler) { 178 final var stopwatch = Timer.start(); 179 handler.beginObject(objectInfo); 180 Foxml11DatastreamInfo dsInfo = null; 181 try { 182 handler.processObjectProperties(readProperties()); 183 while (reader.hasNext()) { 184 if (reader.isCharacters()) { 185 if (!reader.isWhiteSpace()) { 186 throw new RuntimeException("Unexpected character data! \"" + reader.getText() + "\""); 187 } else { 188 // skip whitespace... 189 } 190 } else if (reader.isStartElement()) { 191 if (reader.getLocalName().equals("datastream") 192 && reader.getNamespaceURI().equals(FOXML_NS)) { 193 dsInfo = new Foxml11DatastreamInfo(objectInfo, reader); 194 } else if (reader.getLocalName().equals("datastreamVersion")) { 195 final var v = new Foxml11DatastreamVersion(dsInfo, reader); 196 try { 197 v.validateInlineXml(); 198 } catch (RuntimeException e) { 199 // do we need to do anyting with disabled digests? 200 LOG.error("Inline Validation failed", e); 201 throw new RuntimeException(e); 202 } 203 handler.processDatastreamVersion(v); 204 } else { 205 throw new RuntimeException("Unexpected element! \"" + reader.getLocalName() + "\"!"); 206 } 207 } else if (reader.isEndElement() && (dsInfo != null && reader.getLocalName().equals("datastream"))) { 208 dsInfo = null; 209 } else if (reader.isEndElement() && reader.getLocalName().equals("digitalObject")) { 210 // end of document.... 211 } else { 212 throw new RuntimeException("Unexpected xml structure! \"" + reader.getEventType() + "\" at line " 213 + reader.getLocation().getLineNumber() + ", column " 214 + reader.getLocation().getColumnNumber() 215 + "!" + (reader.isCharacters() ? " \"" + reader.getText() + "\"" : "")); 216 } 217 reader.next(); 218 } 219 } catch (Exception e) { 220 abort(handler, e); 221 } finally { 222 stopwatch.stop(processObjectTimer); 223 } 224 225 completeObjectTimer.record(() -> complete(handler)); 226 } 227 228 private void complete(final StreamingFedoraObjectHandler handler) { 229 try { 230 handler.completeObject(objectInfo); 231 cleanUpTempFiles(); 232 } catch (Exception e) { 233 abort(handler, e); 234 } 235 } 236 237 private void abort(final StreamingFedoraObjectHandler handler, final Exception e) { 238 try { 239 handler.abortObject(objectInfo); 240 if (e instanceof RuntimeException) { 241 throw (RuntimeException) e; 242 } 243 throw new RuntimeException(e); 244 } finally { 245 cleanUpTempFiles(); 246 close(); 247 } 248 } 249 250 /** 251 * Close resources associated to the processor 252 */ 253 public void close() { 254 try { 255 reader.close(); 256 } catch (final XMLStreamException e) { 257 LOG.warn("Failed to close reader cleanly", e); 258 } 259 try { 260 stream.close(); 261 } catch (IOException e) { 262 LOG.warn("Failed to close file cleanly", e); 263 } 264 } 265 266 private void cleanUpTempFiles() { 267 for (final File f : this.tempFiles) { 268 if (f.exists()) { 269 f.delete(); 270 } 271 } 272 } 273 274 private ObjectProperties readProperties() throws JAXBException, XMLStreamException { 275 final JAXBContext jc = JAXBContext.newInstance(FoxmlObjectProperties.class); 276 final Unmarshaller unmarshaller = jc.createUnmarshaller(); 277 final JAXBElement<FoxmlObjectProperties> p = unmarshaller.unmarshal(reader, FoxmlObjectProperties.class); 278 final FoxmlObjectProperties properties = p.getValue(); 279 return properties; 280 } 281 282 private void readUntilClosed(final String name, final String namespace) throws XMLStreamException { 283 while (reader.hasNext()) { 284 if (reader.isEndElement() && reader.getLocalName().equals(name) 285 && reader.getNamespaceURI().equals(namespace)) { 286 return; 287 } else { 288 // skip all other stuff.... 289 } 290 reader.next(); 291 } 292 } 293 294 private class Foxml11DatastreamInfo implements DatastreamInfo { 295 296 private String id; 297 298 private String controlGroup; 299 300 private String fedoraUri; 301 302 private String state; 303 304 private boolean versionable; 305 306 private ObjectInfo objectInfo; 307 308 public Foxml11DatastreamInfo(final ObjectInfo objectInfo, final XMLStreamReader reader) { 309 this.objectInfo = objectInfo; 310 final Map<String, String> attributes 311 = getAttributes(reader, "ID", "CONTROL_GROUP", "FEDORA_URI", "STATE", "VERSIONABLE"); 312 id = attributes.get("ID"); 313 controlGroup = attributes.get("CONTROL_GROUP"); 314 fedoraUri = attributes.get("FEDORA_URI"); 315 state = attributes.get("STATE"); 316 versionable = Boolean.valueOf(attributes.get("VERSIONABLE")); 317 } 318 319 @Override 320 public ObjectInfo getObjectInfo() { 321 return objectInfo; 322 } 323 324 @Override 325 public String getDatastreamId() { 326 return id; 327 } 328 329 @Override 330 public String getControlGroup() { 331 return controlGroup; 332 } 333 334 @Override 335 public String getFedoraURI() { 336 return fedoraUri; 337 } 338 339 @Override 340 public String getState() { 341 return state; 342 } 343 344 @Override 345 public boolean getVersionable() { 346 return versionable; 347 } 348 } 349 350 public class Foxml11DatastreamVersion implements DatastreamVersion { 351 352 private DatastreamInfo dsInfo; 353 354 private String id; 355 private String label; 356 private String created; 357 private Instant createdInstant; 358 private String mimeType; 359 private String altIds; 360 private String formatUri; 361 private long size; 362 private ContentDigest contentDigest; 363 private CachedContent dsContent; 364 private boolean isInlineXml = false; 365 366 /** 367 * foxml datastream version. 368 * @param dsInfo the datastream information 369 * @param reader the reader 370 * @throws XMLStreamException xml stream exception 371 */ 372 public Foxml11DatastreamVersion(final DatastreamInfo dsInfo, 373 final XMLStreamReader reader) throws XMLStreamException { 374 this.dsInfo = dsInfo; 375 final Map<String, String> dsAttributes = getAttributes(reader, "ID", "LABEL", 376 "CREATED", "MIMETYPE", "ALT_IDS", "FORMAT_URI", "SIZE"); 377 id = dsAttributes.get("ID"); 378 label = dsAttributes.get("LABEL"); 379 created = dsAttributes.get("CREATED"); 380 createdInstant = created != null ? Instant.parse(created) : null; 381 mimeType = dsAttributes.get("MIMETYPE"); 382 altIds = dsAttributes.get("ALT_IDS"); 383 formatUri = dsAttributes.get("FORMAT_URI"); 384 size = dsAttributes.containsKey("SIZE") ? Long.parseLong(dsAttributes.get("SIZE")) : -1; 385 reader.next(); 386 387 while (reader.hasNext()) { 388 if (reader.isCharacters()) { 389 if (!reader.isWhiteSpace()) { 390 throw new RuntimeException("Unexpected character data! \"" + reader.getText() + "\""); 391 } else { 392 // skip whitespace... 393 } 394 } else if (reader.isStartElement()) { 395 final String localName = reader.getLocalName(); 396 if (localName.equals("contentDigest")) { 397 final Map<String, String> attributes = getAttributes(reader, "TYPE", "DIGEST"); 398 this.contentDigest = new DefaultContentDigest(attributes.get("TYPE"), attributes.get("DIGEST")); 399 } else if (localName.equals("xmlContent")) { 400 // this XML fragment may not be valid out of context 401 // context, so write it out as a complete XML 402 // file... 403 reader.next(); 404 405 isInlineXml = true; 406 dsContent = new MemoryCachedContent(extractInlineXml()); 407 } else if (localName.equals("contentLocation")) { 408 final Map<String, String> attributes = getAttributes(reader, "REF", "TYPE"); 409 if (attributes.get("TYPE").equals("INTERNAL_ID")) { 410 dsContent = idResolver.resolveInternalID(attributes.get("REF")); 411 } else { 412 try { 413 String ref = attributes.get("REF"); 414 if (ref.contains("local.fedora.server")) { 415 ref = ref.replace("local.fedora.server", localFedoraServer); 416 } 417 dsContent = new URLCachedContent(new URL(ref), fetcher); 418 } catch (final MalformedURLException e) { 419 throw new RuntimeException(e); 420 } 421 } 422 } else if (localName.equals("binaryContent")) { 423 try { 424 final File f = File.createTempFile("decoded", "file"); 425 tempFiles.add(f); 426 final Base64OutputStream out = new Base64OutputStream( 427 new BufferedOutputStream(new FileOutputStream(f)), false); 428 while (reader.next() == XMLStreamConstants.CHARACTERS) { 429 out.write(reader.getText().getBytes("UTF-8")); 430 } 431 out.flush(); 432 out.close(); 433 dsContent = new FileCachedContent(f); 434 } catch (final IOException e) { 435 throw new RuntimeException(e); 436 } 437 readUntilClosed("binaryContent", FOXML_NS); 438 } else { 439 throw new RuntimeException("Unexpected element! \"" + reader.getLocalName() + "\"!"); 440 } 441 } else if (reader.isEndElement()) { 442 if (reader.getLocalName().equals("datastreamVersion")) { 443 return; 444 } 445 } else { 446 throw new RuntimeException("Unexpected xml structure! \"" + reader.getEventType() + "\" at line " 447 + reader.getLocation().getLineNumber() + ", column " 448 + reader.getLocation().getColumnNumber() 449 + "!" + (reader.isCharacters() ? " \"" + reader.getText() + "\"" : "")); 450 } 451 reader.next(); 452 } 453 454 } 455 456 @Override 457 public Optional<File> getFile() { 458 return dsContent.getFile(); 459 } 460 461 private String extractInlineXml() throws XMLStreamException { 462 final XMLEventReader eventReader = XMLInputFactory2.newFactory().createXMLEventReader(reader); 463 while (eventReader.hasNext()) { 464 final XMLEvent event = eventReader.nextEvent(); 465 if (event.isEndElement() 466 && event.asEndElement().getName().getLocalPart().equals("xmlContent") 467 && event.asEndElement().getName().getNamespaceURI().equals(FOXML_NS)) { 468 break; 469 } 470 } 471 472 return inlineXml.removeFirst(); 473 } 474 475 private void validateInlineXml() { 476 if (isInlineXml && contentDigest != null && StringUtils.isNotBlank(contentDigest.getDigest())) { 477 478 if (StringUtils.equals(contentDigest.getType(), "DISABLED")) { 479 LOG.warn("Datastream Digest DISABLED. Skipping digest validation"); 480 return; 481 } 482 483 final var transformedXml = transformInlineXmlForChecksum(); 484 final var digest = DigestUtils.getDigest(contentDigest.getType()); 485 final var digestBytes = DigestUtils.digest(digest, transformedXml); 486 final var digestHex = Hex.encodeHexString(digestBytes); 487 488 if (!digestHex.equalsIgnoreCase(contentDigest.getDigest())) { 489 throw new RuntimeException(String.format( 490 "Inline XML %s %s failed checksum validation. Expected %s: %s; Actual: %s", 491 dsInfo.getObjectInfo().getPid(), dsInfo.getDatastreamId(), 492 contentDigest.getType(), contentDigest.getDigest(), digestHex)); 493 } 494 } 495 } 496 497 /** 498 * This code is based on: https://github.com/fcrepo3/fcrepo-historical/blob/ 499 * e8a3be191cce6bbf8f55cd02bf1d52ac53425146/fcrepo-server/src/main/java/fedora/server/storage/types/ 500 * DatastreamXMLMetadata.java#L92 501 * 502 * This code MUST use these deprecated classes in order to generate the XML attributes in the expected order. 503 * 504 * @return the xml in the format Fedora 3 used to calculate digests 505 */ 506 private byte[] transformInlineXmlForChecksum() { 507 try { 508 // This MUST be done or else Windows will refuse to use the correct encoding!!! :( :( :( 509 final var xml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" 510 + IOUtils.toString(dsContent.getInputStream(), StandardCharsets.UTF_8); 511 512 final var isReader = new InputStreamReader(IOUtils.toInputStream(xml), StandardCharsets.UTF_8); 513 final var source = new InputSource(isReader); 514 source.setEncoding("UTF-8"); 515 516 final Document doc = documentBuilder.parse(source); 517 518 final OutputFormat fmt = new OutputFormat("XML", "UTF-8", false); 519 // indent == 0 means add no indenting 520 fmt.setIndent(0); 521 // default line width is 72, but only applies when indenting 522 fmt.setLineWidth(0); 523 fmt.setPreserveSpace(false); 524 525 final StringWriter out = new StringWriter(); 526 final XMLSerializer ser = new XMLSerializer(out, fmt); 527 ser.serialize(doc); 528 out.close(); 529 530 final var baos = new ByteArrayOutputStream(); 531 final var br = new BufferedReader(new StringReader(out.toString())); 532 String line; 533 final PrintWriter outStream = new PrintWriter(new OutputStreamWriter(baos, StandardCharsets.UTF_8)); 534 while ((line = br.readLine()) != null) { 535 line = line.trim(); 536 outStream.append(line); 537 } 538 outStream.close(); 539 540 return baos.toByteArray(); 541 } catch (IOException e) { 542 throw new UncheckedIOException(e); 543 } catch (SAXException e) { 544 try { 545 LOG.error("Malformed inline XML: {}", IOUtils.toString(dsContent.getInputStream())); 546 } catch (IOException e2) { 547 // swallow 548 } 549 throw new RuntimeException(e); 550 } 551 } 552 553 @Override 554 public DatastreamInfo getDatastreamInfo() { 555 return dsInfo; 556 } 557 558 @Override 559 public String getVersionId() { 560 return id; 561 } 562 563 @Override 564 public String getMimeType() { 565 return mimeType; 566 } 567 568 @Override 569 public String getLabel() { 570 return label; 571 } 572 573 @Override 574 public String getCreated() { 575 return created; 576 } 577 578 @Override 579 public Instant getCreatedInstant() { 580 return createdInstant; 581 } 582 583 @Override 584 public String getAltIds() { 585 return altIds; 586 } 587 588 @Override 589 public String getFormatUri() { 590 return formatUri; 591 } 592 593 @Override 594 public long getSize() { 595 return size; 596 } 597 598 @Override 599 public ContentDigest getContentDigest() { 600 // The digests for inline xml do not match what is stored in the FOXML and should not be returned here. 601 if (isInlineXml) { 602 return null; 603 } 604 return contentDigest; 605 } 606 607 @Override 608 public InputStream getContent() throws IOException { 609 return dsContent.getInputStream(); 610 } 611 612 @Override 613 public String getExternalOrRedirectURL() { 614 if (dsContent instanceof URLCachedContent) { 615 return ((URLCachedContent) dsContent).getURL().toString(); 616 } else { 617 throw new IllegalStateException(); 618 } 619 } 620 621 @Override 622 public boolean isFirstVersionIn(final ObjectReference obj) { 623 final List<DatastreamVersion> datastreams = 624 obj.getDatastreamVersions(getDatastreamInfo().getDatastreamId()); 625 return datastreams.indexOf(this) == 0; 626 } 627 628 @Override 629 public boolean isLastVersionIn(final ObjectReference obj) { 630 final List<DatastreamVersion> datastreams = 631 obj.getDatastreamVersions(getDatastreamInfo().getDatastreamId()); 632 return datastreams.indexOf(this) == datastreams.size() - 1; 633 } 634 } 635 636 private static Map<String, String> getAttributes(final XMLStreamReader r, 637 final String ... allowedNames) { 638 final HashMap<String, String> result = new HashMap<String, String>(); 639 final Set<String> allowed = new HashSet<String>(Arrays.asList(allowedNames)); 640 for (int i = 0; i < r.getAttributeCount(); i ++) { 641 final String localName = r.getAttributeLocalName(i); 642 final String value = r.getAttributeValue(i); 643 if (allowed.contains(localName)) { 644 result.put(localName, value); 645 } else { 646 System.err.println("Unexpected attribute: " + localName + " = \"" + value + "\""); 647 } 648 } 649 return result; 650 651 } 652 653}