001/** 002 * The contents of this file are subject to the license and copyright 003 * detailed in the LICENSE and NOTICE files at the root of the source 004 * tree. 005 * 006 */ 007package org.fcrepo.migration.foxml; 008 009import io.micrometer.core.instrument.Metrics; 010import io.micrometer.core.instrument.Timer; 011import org.apache.commons.codec.binary.Base64OutputStream; 012import org.apache.commons.codec.binary.Hex; 013import org.apache.commons.codec.digest.DigestUtils; 014import org.apache.commons.io.FileUtils; 015import org.apache.commons.io.IOUtils; 016import org.apache.commons.lang3.StringUtils; 017import org.apache.xml.serialize.OutputFormat; 018import org.apache.xml.serialize.XMLSerializer; 019import org.codehaus.stax2.XMLInputFactory2; 020import org.fcrepo.migration.ContentDigest; 021import org.fcrepo.migration.DatastreamInfo; 022import org.fcrepo.migration.DatastreamVersion; 023import org.fcrepo.migration.DefaultContentDigest; 024import org.fcrepo.migration.DefaultObjectInfo; 025import org.fcrepo.migration.FedoraObjectProcessor; 026import org.fcrepo.migration.ObjectInfo; 027import org.fcrepo.migration.ObjectProperties; 028import org.fcrepo.migration.ObjectReference; 029import org.fcrepo.migration.StreamingFedoraObjectHandler; 030import org.slf4j.Logger; 031import org.slf4j.LoggerFactory; 032import org.w3c.dom.Document; 033import org.xml.sax.InputSource; 034import org.xml.sax.SAXException; 035 036import javax.xml.bind.JAXBContext; 037import javax.xml.bind.JAXBElement; 038import javax.xml.bind.JAXBException; 039import javax.xml.bind.Unmarshaller; 040import javax.xml.parsers.DocumentBuilder; 041import javax.xml.parsers.DocumentBuilderFactory; 042import javax.xml.parsers.ParserConfigurationException; 043import javax.xml.stream.XMLEventReader; 044import javax.xml.stream.XMLInputFactory; 045import javax.xml.stream.XMLStreamConstants; 046import javax.xml.stream.XMLStreamException; 047import javax.xml.stream.XMLStreamReader; 048import javax.xml.stream.events.XMLEvent; 049import java.io.BufferedInputStream; 050import java.io.BufferedOutputStream; 051import java.io.BufferedReader; 052import java.io.ByteArrayOutputStream; 053import java.io.File; 054import java.io.FileInputStream; 055import java.io.FileNotFoundException; 056import java.io.FileOutputStream; 057import java.io.IOException; 058import java.io.InputStream; 059import java.io.InputStreamReader; 060import java.io.OutputStreamWriter; 061import java.io.PrintWriter; 062import java.io.StringReader; 063import java.io.StringWriter; 064import java.io.UncheckedIOException; 065import java.net.MalformedURLException; 066import java.net.URL; 067import java.nio.charset.StandardCharsets; 068import java.time.Instant; 069import java.util.ArrayList; 070import java.util.Arrays; 071import java.util.HashMap; 072import java.util.HashSet; 073import java.util.LinkedList; 074import java.util.List; 075import java.util.Map; 076import java.util.Optional; 077import java.util.Set; 078import java.util.regex.Pattern; 079 080/** 081 * A FedoraObjectProcessor implementation that uses the STaX API to process 082 * a FOXML XML InputStream. 083 * @author mdurbin 084 */ 085public class FoxmlInputStreamFedoraObjectProcessor implements FedoraObjectProcessor { 086 087 private static final Logger LOG = LoggerFactory.getLogger(FoxmlInputStreamFedoraObjectProcessor.class); 088 089 private static final Pattern INLINE_PATTERN = Pattern.compile("<foxml:xmlContent>(.*?)</foxml:xmlContent>", 090 Pattern.DOTALL); 091 092 private static final String FOXML_NS = "info:fedora/fedora-system:def/foxml#"; 093 094 private static final String METRIC_NAME = "fcrepo.storage.foxml.object"; 095 private static final String OPERATION = "operation"; 096 private static final Timer processObjectTimer = Metrics.timer(METRIC_NAME, OPERATION, "processObject"); 097 private static final Timer completeObjectTimer = Metrics.timer(METRIC_NAME, OPERATION, "completeObject"); 098 099 private URLFetcher fetcher; 100 101 private String localFedoraServer; 102 103 private InternalIDResolver idResolver; 104 105 private File file; 106 107 private InputStream stream; 108 109 private XMLStreamReader reader; 110 111 private DocumentBuilder documentBuilder; 112 113 private List<File> tempFiles; 114 115 private LinkedList<String> inlineXml; 116 117 /** 118 * The basic object information read from the XML stream at construction 119 * time by processing the root XML element and its attributes. 120 */ 121 private ObjectInfo objectInfo; 122 123 /** 124 * foxml input stream fedora object processor. 125 * @param file the FOXML file 126 * @param fetcher the fetcher 127 * @param resolver the resolver 128 * @param localFedoraServer the host and port (formatted like "localhost:8080") of the fedora 3 server 129 * from which the content exposed by the "is" parameter comes. 130 * @throws XMLStreamException xml stream exception 131 */ 132 public FoxmlInputStreamFedoraObjectProcessor(final File file, final URLFetcher fetcher, 133 final InternalIDResolver resolver, final String localFedoraServer) 134 throws XMLStreamException, FileNotFoundException { 135 this.file = file; 136 this.fetcher = fetcher; 137 this.idResolver = resolver; 138 this.localFedoraServer = localFedoraServer; 139 final XMLInputFactory factory = XMLInputFactory.newFactory(); 140 stream = new BufferedInputStream(new FileInputStream(file)); 141 reader = factory.createXMLStreamReader(stream); 142 reader.nextTag(); 143 final Map<String, String> attributes = getAttributes(reader, "PID", "VERSION", "FEDORA_URI", "schemaLocation"); 144 objectInfo = new DefaultObjectInfo(attributes.get("PID"), attributes.get("FEDORA_URI"), this.file.toPath()); 145 while (reader.next() == XMLStreamConstants.CHARACTERS) { 146 } 147 148 tempFiles = new ArrayList<File>(); 149 150 final var builderFactory = DocumentBuilderFactory.newInstance(); 151 builderFactory.setNamespaceAware(true); 152 builderFactory.setIgnoringComments(false); 153 try { 154 documentBuilder = builderFactory.newDocumentBuilder(); 155 } catch (ParserConfigurationException e) { 156 throw new RuntimeException(e); 157 } 158 159 try { 160 inlineXml = new LinkedList<>(); 161 final var content = FileUtils.readFileToString(file); 162 final var matcher = INLINE_PATTERN.matcher(content); 163 while (matcher.find()) { 164 inlineXml.add(matcher.group(1)); 165 } 166 } catch (IOException e) { 167 throw new UncheckedIOException(e); 168 } 169 } 170 171 @Override 172 public ObjectInfo getObjectInfo() { 173 return objectInfo; 174 } 175 176 @Override 177 public void processObject(final StreamingFedoraObjectHandler handler) { 178 final var stopwatch = Timer.start(); 179 handler.beginObject(objectInfo); 180 Foxml11DatastreamInfo dsInfo = null; 181 try { 182 handler.processObjectProperties(readProperties()); 183 while (reader.hasNext()) { 184 if (reader.isCharacters()) { 185 if (!reader.isWhiteSpace()) { 186 throw new RuntimeException("Unexpected character data! \"" + reader.getText() + "\""); 187 } else { 188 // skip whitespace... 189 } 190 } else if (reader.isStartElement()) { 191 if (reader.getLocalName().equals("datastream") 192 && reader.getNamespaceURI().equals(FOXML_NS)) { 193 dsInfo = new Foxml11DatastreamInfo(objectInfo, reader); 194 } else if (reader.getLocalName().equals("datastreamVersion")) { 195 final var v = new Foxml11DatastreamVersion(dsInfo, reader); 196 v.validateInlineXml(); 197 handler.processDatastreamVersion(v); 198 } else { 199 throw new RuntimeException("Unexpected element! \"" + reader.getLocalName() + "\"!"); 200 } 201 } else if (reader.isEndElement() && (dsInfo != null && reader.getLocalName().equals("datastream"))) { 202 dsInfo = null; 203 } else if (reader.isEndElement() && reader.getLocalName().equals("digitalObject")) { 204 // end of document.... 205 } else { 206 throw new RuntimeException("Unexpected xml structure! \"" + reader.getEventType() + "\" at line " 207 + reader.getLocation().getLineNumber() + ", column " 208 + reader.getLocation().getColumnNumber() 209 + "!" + (reader.isCharacters() ? " \"" + reader.getText() + "\"" : "")); 210 } 211 reader.next(); 212 } 213 } catch (Exception e) { 214 abort(handler, e); 215 } finally { 216 stopwatch.stop(processObjectTimer); 217 } 218 219 completeObjectTimer.record(() -> complete(handler)); 220 } 221 222 private void complete(final StreamingFedoraObjectHandler handler) { 223 try { 224 handler.completeObject(objectInfo); 225 cleanUpTempFiles(); 226 } catch (Exception e) { 227 abort(handler, e); 228 } 229 } 230 231 private void abort(final StreamingFedoraObjectHandler handler, final Exception e) { 232 try { 233 handler.abortObject(objectInfo); 234 if (e instanceof RuntimeException) { 235 throw (RuntimeException) e; 236 } 237 throw new RuntimeException(e); 238 } finally { 239 cleanUpTempFiles(); 240 close(); 241 } 242 } 243 244 /** 245 * Close resources associated to the processor 246 */ 247 public void close() { 248 try { 249 reader.close(); 250 } catch (final XMLStreamException e) { 251 LOG.warn("Failed to close reader cleanly", e); 252 } 253 try { 254 stream.close(); 255 } catch (IOException e) { 256 LOG.warn("Failed to close file cleanly", e); 257 } 258 } 259 260 private void cleanUpTempFiles() { 261 for (final File f : this.tempFiles) { 262 if (f.exists()) { 263 f.delete(); 264 } 265 } 266 } 267 268 private ObjectProperties readProperties() throws JAXBException, XMLStreamException { 269 final JAXBContext jc = JAXBContext.newInstance(FoxmlObjectProperties.class); 270 final Unmarshaller unmarshaller = jc.createUnmarshaller(); 271 final JAXBElement<FoxmlObjectProperties> p = unmarshaller.unmarshal(reader, FoxmlObjectProperties.class); 272 final FoxmlObjectProperties properties = p.getValue(); 273 return properties; 274 } 275 276 private void readUntilClosed(final String name, final String namespace) throws XMLStreamException { 277 while (reader.hasNext()) { 278 if (reader.isEndElement() && reader.getLocalName().equals(name) 279 && reader.getNamespaceURI().equals(namespace)) { 280 return; 281 } else { 282 // skip all other stuff.... 283 } 284 reader.next(); 285 } 286 } 287 288 private class Foxml11DatastreamInfo implements DatastreamInfo { 289 290 private String id; 291 292 private String controlGroup; 293 294 private String fedoraUri; 295 296 private String state; 297 298 private boolean versionable; 299 300 private ObjectInfo objectInfo; 301 302 public Foxml11DatastreamInfo(final ObjectInfo objectInfo, final XMLStreamReader reader) { 303 this.objectInfo = objectInfo; 304 final Map<String, String> attributes 305 = getAttributes(reader, "ID", "CONTROL_GROUP", "FEDORA_URI", "STATE", "VERSIONABLE"); 306 id = attributes.get("ID"); 307 controlGroup = attributes.get("CONTROL_GROUP"); 308 fedoraUri = attributes.get("FEDORA_URI"); 309 state = attributes.get("STATE"); 310 versionable = Boolean.valueOf(attributes.get("VERSIONABLE")); 311 } 312 313 @Override 314 public ObjectInfo getObjectInfo() { 315 return objectInfo; 316 } 317 318 @Override 319 public String getDatastreamId() { 320 return id; 321 } 322 323 @Override 324 public String getControlGroup() { 325 return controlGroup; 326 } 327 328 @Override 329 public String getFedoraURI() { 330 return fedoraUri; 331 } 332 333 @Override 334 public String getState() { 335 return state; 336 } 337 338 @Override 339 public boolean getVersionable() { 340 return versionable; 341 } 342 } 343 344 public class Foxml11DatastreamVersion implements DatastreamVersion { 345 346 private DatastreamInfo dsInfo; 347 348 private String id; 349 private String label; 350 private String created; 351 private Instant createdInstant; 352 private String mimeType; 353 private String altIds; 354 private String formatUri; 355 private long size; 356 private ContentDigest contentDigest; 357 private CachedContent dsContent; 358 private boolean isInlineXml = false; 359 360 /** 361 * foxml datastream version. 362 * @param dsInfo the datastream information 363 * @param reader the reader 364 * @throws XMLStreamException xml stream exception 365 */ 366 public Foxml11DatastreamVersion(final DatastreamInfo dsInfo, 367 final XMLStreamReader reader) throws XMLStreamException { 368 this.dsInfo = dsInfo; 369 final Map<String, String> dsAttributes = getAttributes(reader, "ID", "LABEL", 370 "CREATED", "MIMETYPE", "ALT_IDS", "FORMAT_URI", "SIZE"); 371 id = dsAttributes.get("ID"); 372 label = dsAttributes.get("LABEL"); 373 created = dsAttributes.get("CREATED"); 374 createdInstant = created != null ? Instant.parse(created) : null; 375 mimeType = dsAttributes.get("MIMETYPE"); 376 altIds = dsAttributes.get("ALT_IDS"); 377 formatUri = dsAttributes.get("FORMAT_URI"); 378 size = dsAttributes.containsKey("SIZE") ? Long.parseLong(dsAttributes.get("SIZE")) : -1; 379 reader.next(); 380 381 while (reader.hasNext()) { 382 if (reader.isCharacters()) { 383 if (!reader.isWhiteSpace()) { 384 throw new RuntimeException("Unexpected character data! \"" + reader.getText() + "\""); 385 } else { 386 // skip whitespace... 387 } 388 } else if (reader.isStartElement()) { 389 final String localName = reader.getLocalName(); 390 if (localName.equals("contentDigest")) { 391 final Map<String, String> attributes = getAttributes(reader, "TYPE", "DIGEST"); 392 this.contentDigest = new DefaultContentDigest(attributes.get("TYPE"), attributes.get("DIGEST")); 393 } else if (localName.equals("xmlContent")) { 394 // this XML fragment may not be valid out of context 395 // context, so write it out as a complete XML 396 // file... 397 reader.next(); 398 399 isInlineXml = true; 400 dsContent = new MemoryCachedContent(extractInlineXml()); 401 } else if (localName.equals("contentLocation")) { 402 final Map<String, String> attributes = getAttributes(reader, "REF", "TYPE"); 403 if (attributes.get("TYPE").equals("INTERNAL_ID")) { 404 dsContent = idResolver.resolveInternalID(attributes.get("REF")); 405 } else { 406 try { 407 String ref = attributes.get("REF"); 408 if (ref.contains("local.fedora.server")) { 409 ref = ref.replace("local.fedora.server", localFedoraServer); 410 } 411 dsContent = new URLCachedContent(new URL(ref), fetcher); 412 } catch (final MalformedURLException e) { 413 throw new RuntimeException(e); 414 } 415 } 416 } else if (localName.equals("binaryContent")) { 417 try { 418 final File f = File.createTempFile("decoded", "file"); 419 tempFiles.add(f); 420 final Base64OutputStream out = new Base64OutputStream( 421 new BufferedOutputStream(new FileOutputStream(f)), false); 422 while (reader.next() == XMLStreamConstants.CHARACTERS) { 423 out.write(reader.getText().getBytes("UTF-8")); 424 } 425 out.flush(); 426 out.close(); 427 dsContent = new FileCachedContent(f); 428 } catch (final IOException e) { 429 throw new RuntimeException(e); 430 } 431 readUntilClosed("binaryContent", FOXML_NS); 432 } else { 433 throw new RuntimeException("Unexpected element! \"" + reader.getLocalName() + "\"!"); 434 } 435 } else if (reader.isEndElement()) { 436 if (reader.getLocalName().equals("datastreamVersion")) { 437 return; 438 } 439 } else { 440 throw new RuntimeException("Unexpected xml structure! \"" + reader.getEventType() + "\" at line " 441 + reader.getLocation().getLineNumber() + ", column " 442 + reader.getLocation().getColumnNumber() 443 + "!" + (reader.isCharacters() ? " \"" + reader.getText() + "\"" : "")); 444 } 445 reader.next(); 446 } 447 448 } 449 450 @Override 451 public Optional<File> getFile() { 452 return dsContent.getFile(); 453 } 454 455 private String extractInlineXml() throws XMLStreamException { 456 final XMLEventReader eventReader = XMLInputFactory2.newFactory().createXMLEventReader(reader); 457 while (eventReader.hasNext()) { 458 final XMLEvent event = eventReader.nextEvent(); 459 if (event.isEndElement() 460 && event.asEndElement().getName().getLocalPart().equals("xmlContent") 461 && event.asEndElement().getName().getNamespaceURI().equals(FOXML_NS)) { 462 break; 463 } 464 } 465 466 return inlineXml.removeFirst(); 467 } 468 469 private void validateInlineXml() { 470 if (isInlineXml && contentDigest != null && StringUtils.isNotBlank(contentDigest.getDigest())) { 471 final var transformedXml = transformInlineXmlForChecksum(); 472 final var digest = DigestUtils.getDigest(contentDigest.getType()); 473 final var digestBytes = DigestUtils.digest(digest, transformedXml); 474 final var digestHex = Hex.encodeHexString(digestBytes); 475 476 if (!digestHex.equalsIgnoreCase(contentDigest.getDigest())) { 477 throw new RuntimeException(String.format( 478 "Inline XML %s %s failed checksum validation. Expected %s: %s; Actual: %s", 479 dsInfo.getObjectInfo().getPid(), dsInfo.getDatastreamId(), 480 contentDigest.getType(), contentDigest.getDigest(), digestHex)); 481 } 482 } 483 } 484 485 /** 486 * This code is based on: https://github.com/fcrepo3/fcrepo-historical/blob/ 487 * e8a3be191cce6bbf8f55cd02bf1d52ac53425146/fcrepo-server/src/main/java/fedora/server/storage/types/ 488 * DatastreamXMLMetadata.java#L92 489 * 490 * This code MUST use these deprecated classes in order to generate the XML attributes in the expected order. 491 * 492 * @return the xml in the format Fedora 3 used to calculate digests 493 */ 494 private byte[] transformInlineXmlForChecksum() { 495 try { 496 // This MUST be done or else Windows will refuse to use the correct encoding!!! :( :( :( 497 final var xml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" 498 + IOUtils.toString(dsContent.getInputStream(), StandardCharsets.UTF_8); 499 500 final var isReader = new InputStreamReader(IOUtils.toInputStream(xml), StandardCharsets.UTF_8); 501 final var source = new InputSource(isReader); 502 source.setEncoding("UTF-8"); 503 504 final Document doc = documentBuilder.parse(source); 505 506 final OutputFormat fmt = new OutputFormat("XML", "UTF-8", false); 507 // indent == 0 means add no indenting 508 fmt.setIndent(0); 509 // default line width is 72, but only applies when indenting 510 fmt.setLineWidth(0); 511 fmt.setPreserveSpace(false); 512 513 final StringWriter out = new StringWriter(); 514 final XMLSerializer ser = new XMLSerializer(out, fmt); 515 ser.serialize(doc); 516 out.close(); 517 518 final var baos = new ByteArrayOutputStream(); 519 final var br = new BufferedReader(new StringReader(out.toString())); 520 String line; 521 final PrintWriter outStream = new PrintWriter(new OutputStreamWriter(baos, StandardCharsets.UTF_8)); 522 while ((line = br.readLine()) != null) { 523 line = line.trim(); 524 outStream.append(line); 525 } 526 outStream.close(); 527 528 return baos.toByteArray(); 529 } catch (IOException e) { 530 throw new UncheckedIOException(e); 531 } catch (SAXException e) { 532 try { 533 LOG.error("Malformed inline XML: {}", IOUtils.toString(dsContent.getInputStream())); 534 } catch (IOException e2) { 535 // swallow 536 } 537 throw new RuntimeException(e); 538 } 539 } 540 541 @Override 542 public DatastreamInfo getDatastreamInfo() { 543 return dsInfo; 544 } 545 546 @Override 547 public String getVersionId() { 548 return id; 549 } 550 551 @Override 552 public String getMimeType() { 553 return mimeType; 554 } 555 556 @Override 557 public String getLabel() { 558 return label; 559 } 560 561 @Override 562 public String getCreated() { 563 return created; 564 } 565 566 @Override 567 public Instant getCreatedInstant() { 568 return createdInstant; 569 } 570 571 @Override 572 public String getAltIds() { 573 return altIds; 574 } 575 576 @Override 577 public String getFormatUri() { 578 return formatUri; 579 } 580 581 @Override 582 public long getSize() { 583 return size; 584 } 585 586 @Override 587 public ContentDigest getContentDigest() { 588 // The digests for inline xml do not match what is stored in the FOXML and should not be returned here. 589 if (isInlineXml) { 590 return null; 591 } 592 return contentDigest; 593 } 594 595 @Override 596 public InputStream getContent() throws IOException { 597 return dsContent.getInputStream(); 598 } 599 600 @Override 601 public String getExternalOrRedirectURL() { 602 if (dsContent instanceof URLCachedContent) { 603 return ((URLCachedContent) dsContent).getURL().toString(); 604 } else { 605 throw new IllegalStateException(); 606 } 607 } 608 609 @Override 610 public boolean isFirstVersionIn(final ObjectReference obj) { 611 final List<DatastreamVersion> datastreams = 612 obj.getDatastreamVersions(getDatastreamInfo().getDatastreamId()); 613 return datastreams.indexOf(this) == 0; 614 } 615 616 @Override 617 public boolean isLastVersionIn(final ObjectReference obj) { 618 final List<DatastreamVersion> datastreams = 619 obj.getDatastreamVersions(getDatastreamInfo().getDatastreamId()); 620 return datastreams.indexOf(this) == datastreams.size() - 1; 621 } 622 } 623 624 private static Map<String, String> getAttributes(final XMLStreamReader r, 625 final String ... allowedNames) { 626 final HashMap<String, String> result = new HashMap<String, String>(); 627 final Set<String> allowed = new HashSet<String>(Arrays.asList(allowedNames)); 628 for (int i = 0; i < r.getAttributeCount(); i ++) { 629 final String localName = r.getAttributeLocalName(i); 630 final String value = r.getAttributeValue(i); 631 if (allowed.contains(localName)) { 632 result.put(localName, value); 633 } else { 634 System.err.println("Unexpected attribute: " + localName + " = \"" + value + "\""); 635 } 636 } 637 return result; 638 639 } 640 641}