001/** 002 * The contents of this file are subject to the license and copyright 003 * detailed in the LICENSE and NOTICE files at the root of the source 004 * tree. 005 * 006 */ 007package org.fcrepo.migration.foxml; 008 009import io.micrometer.core.instrument.Metrics; 010import io.micrometer.core.instrument.Timer; 011import org.apache.commons.codec.binary.Base64OutputStream; 012import org.apache.commons.codec.binary.Hex; 013import org.apache.commons.codec.digest.DigestUtils; 014import org.apache.commons.io.FileUtils; 015import org.apache.commons.io.IOUtils; 016import org.apache.commons.lang3.StringUtils; 017import org.apache.xml.serialize.OutputFormat; 018import org.apache.xml.serialize.XMLSerializer; 019import org.codehaus.stax2.XMLInputFactory2; 020import org.fcrepo.migration.ContentDigest; 021import org.fcrepo.migration.DatastreamInfo; 022import org.fcrepo.migration.DatastreamVersion; 023import org.fcrepo.migration.DefaultContentDigest; 024import org.fcrepo.migration.DefaultObjectInfo; 025import org.fcrepo.migration.FedoraObjectProcessor; 026import org.fcrepo.migration.ObjectInfo; 027import org.fcrepo.migration.ObjectProperties; 028import org.fcrepo.migration.ObjectReference; 029import org.fcrepo.migration.StreamingFedoraObjectHandler; 030import org.slf4j.Logger; 031import org.slf4j.LoggerFactory; 032import org.w3c.dom.Document; 033import org.xml.sax.InputSource; 034import org.xml.sax.SAXException; 035 036import javax.xml.bind.JAXBContext; 037import javax.xml.bind.JAXBElement; 038import javax.xml.bind.JAXBException; 039import javax.xml.bind.Unmarshaller; 040import javax.xml.parsers.DocumentBuilder; 041import javax.xml.parsers.DocumentBuilderFactory; 042import javax.xml.parsers.ParserConfigurationException; 043import javax.xml.stream.XMLEventReader; 044import javax.xml.stream.XMLInputFactory; 045import javax.xml.stream.XMLStreamConstants; 046import javax.xml.stream.XMLStreamException; 047import javax.xml.stream.XMLStreamReader; 048import javax.xml.stream.events.XMLEvent; 049import java.io.BufferedInputStream; 050import java.io.BufferedOutputStream; 051import java.io.BufferedReader; 052import java.io.ByteArrayOutputStream; 053import java.io.File; 054import java.io.FileInputStream; 055import java.io.FileNotFoundException; 056import java.io.FileOutputStream; 057import java.io.IOException; 058import java.io.InputStream; 059import java.io.InputStreamReader; 060import java.io.OutputStreamWriter; 061import java.io.PrintWriter; 062import java.io.StringReader; 063import java.io.StringWriter; 064import java.io.UncheckedIOException; 065import java.net.MalformedURLException; 066import java.net.URL; 067import java.nio.charset.StandardCharsets; 068import java.util.ArrayList; 069import java.util.Arrays; 070import java.util.HashMap; 071import java.util.HashSet; 072import java.util.LinkedList; 073import java.util.List; 074import java.util.Map; 075import java.util.Optional; 076import java.util.Set; 077import java.util.regex.Pattern; 078 079/** 080 * A FedoraObjectProcessor implementation that uses the STaX API to process 081 * a FOXML XML InputStream. 082 * @author mdurbin 083 */ 084public class FoxmlInputStreamFedoraObjectProcessor implements FedoraObjectProcessor { 085 086 private static final Logger LOG = LoggerFactory.getLogger(FoxmlInputStreamFedoraObjectProcessor.class); 087 088 private static final Pattern INLINE_PATTERN = Pattern.compile("<foxml:xmlContent>(.*?)</foxml:xmlContent>", 089 Pattern.DOTALL); 090 091 private static final String FOXML_NS = "info:fedora/fedora-system:def/foxml#"; 092 093 private static final String METRIC_NAME = "fcrepo.storage.foxml.object"; 094 private static final String OPERATION = "operation"; 095 private static final Timer processObjectTimer = Metrics.timer(METRIC_NAME, OPERATION, "processObject"); 096 private static final Timer completeObjectTimer = Metrics.timer(METRIC_NAME, OPERATION, "completeObject"); 097 098 private URLFetcher fetcher; 099 100 private String localFedoraServer; 101 102 private InternalIDResolver idResolver; 103 104 private File file; 105 106 private InputStream stream; 107 108 private XMLStreamReader reader; 109 110 private DocumentBuilder documentBuilder; 111 112 private List<File> tempFiles; 113 114 private LinkedList<String> inlineXml; 115 116 /** 117 * The basic object information read from the XML stream at construction 118 * time by processing the root XML element and its attributes. 119 */ 120 private ObjectInfo objectInfo; 121 122 /** 123 * foxml input stream fedora object processor. 124 * @param file the FOXML file 125 * @param fetcher the fetcher 126 * @param resolver the resolver 127 * @param localFedoraServer the host and port (formatted like "localhost:8080") of the fedora 3 server 128 * from which the content exposed by the "is" parameter comes. 129 * @throws XMLStreamException xml stream exception 130 */ 131 public FoxmlInputStreamFedoraObjectProcessor(final File file, final URLFetcher fetcher, 132 final InternalIDResolver resolver, final String localFedoraServer) 133 throws XMLStreamException, FileNotFoundException { 134 this.file = file; 135 this.fetcher = fetcher; 136 this.idResolver = resolver; 137 this.localFedoraServer = localFedoraServer; 138 final XMLInputFactory factory = XMLInputFactory.newFactory(); 139 stream = new BufferedInputStream(new FileInputStream(file)); 140 reader = factory.createXMLStreamReader(stream); 141 reader.nextTag(); 142 final Map<String, String> attributes = getAttributes(reader, "PID", "VERSION", "FEDORA_URI", "schemaLocation"); 143 objectInfo = new DefaultObjectInfo(attributes.get("PID"), attributes.get("FEDORA_URI"), this.file.toPath()); 144 while (reader.next() == XMLStreamConstants.CHARACTERS) { 145 } 146 147 tempFiles = new ArrayList<File>(); 148 149 final var builderFactory = DocumentBuilderFactory.newInstance(); 150 builderFactory.setNamespaceAware(true); 151 builderFactory.setIgnoringComments(false); 152 try { 153 documentBuilder = builderFactory.newDocumentBuilder(); 154 } catch (ParserConfigurationException e) { 155 throw new RuntimeException(e); 156 } 157 158 try { 159 inlineXml = new LinkedList<>(); 160 final var content = FileUtils.readFileToString(file); 161 final var matcher = INLINE_PATTERN.matcher(content); 162 while (matcher.find()) { 163 inlineXml.add(matcher.group(1)); 164 } 165 } catch (IOException e) { 166 throw new UncheckedIOException(e); 167 } 168 } 169 170 @Override 171 public ObjectInfo getObjectInfo() { 172 return objectInfo; 173 } 174 175 @Override 176 public void processObject(final StreamingFedoraObjectHandler handler) { 177 final var stopwatch = Timer.start(); 178 handler.beginObject(objectInfo); 179 Foxml11DatastreamInfo dsInfo = null; 180 try { 181 handler.processObjectProperties(readProperties()); 182 while (reader.hasNext()) { 183 if (reader.isCharacters()) { 184 if (!reader.isWhiteSpace()) { 185 throw new RuntimeException("Unexpected character data! \"" + reader.getText() + "\""); 186 } else { 187 // skip whitespace... 188 } 189 } else if (reader.isStartElement()) { 190 if (reader.getLocalName().equals("datastream") 191 && reader.getNamespaceURI().equals(FOXML_NS)) { 192 dsInfo = new Foxml11DatastreamInfo(objectInfo, reader); 193 } else if (reader.getLocalName().equals("datastreamVersion")) { 194 final var v = new Foxml11DatastreamVersion(dsInfo, reader); 195 v.validateInlineXml(); 196 handler.processDatastreamVersion(v); 197 } else { 198 throw new RuntimeException("Unexpected element! \"" + reader.getLocalName() + "\"!"); 199 } 200 } else if (reader.isEndElement() && (dsInfo != null && reader.getLocalName().equals("datastream"))) { 201 dsInfo = null; 202 } else if (reader.isEndElement() && reader.getLocalName().equals("digitalObject")) { 203 // end of document.... 204 } else { 205 throw new RuntimeException("Unexpected xml structure! \"" + reader.getEventType() + "\" at line " 206 + reader.getLocation().getLineNumber() + ", column " 207 + reader.getLocation().getColumnNumber() 208 + "!" + (reader.isCharacters() ? " \"" + reader.getText() + "\"" : "")); 209 } 210 reader.next(); 211 } 212 } catch (Exception e) { 213 abort(handler, e); 214 } finally { 215 stopwatch.stop(processObjectTimer); 216 } 217 218 completeObjectTimer.record(() -> complete(handler)); 219 } 220 221 private void complete(final StreamingFedoraObjectHandler handler) { 222 try { 223 handler.completeObject(objectInfo); 224 cleanUpTempFiles(); 225 } catch (Exception e) { 226 abort(handler, e); 227 } 228 } 229 230 private void abort(final StreamingFedoraObjectHandler handler, final Exception e) { 231 try { 232 handler.abortObject(objectInfo); 233 if (e instanceof RuntimeException) { 234 throw (RuntimeException) e; 235 } 236 throw new RuntimeException(e); 237 } finally { 238 cleanUpTempFiles(); 239 close(); 240 } 241 } 242 243 /** 244 * Close resources associated to the processor 245 */ 246 public void close() { 247 try { 248 reader.close(); 249 } catch (final XMLStreamException e) { 250 LOG.warn("Failed to close reader cleanly", e); 251 } 252 try { 253 stream.close(); 254 } catch (IOException e) { 255 LOG.warn("Failed to close file cleanly", e); 256 } 257 } 258 259 private void cleanUpTempFiles() { 260 for (final File f : this.tempFiles) { 261 if (f.exists()) { 262 f.delete(); 263 } 264 } 265 } 266 267 private ObjectProperties readProperties() throws JAXBException, XMLStreamException { 268 final JAXBContext jc = JAXBContext.newInstance(FoxmlObjectProperties.class); 269 final Unmarshaller unmarshaller = jc.createUnmarshaller(); 270 final JAXBElement<FoxmlObjectProperties> p = unmarshaller.unmarshal(reader, FoxmlObjectProperties.class); 271 final FoxmlObjectProperties properties = p.getValue(); 272 return properties; 273 } 274 275 private void readUntilClosed(final String name, final String namespace) throws XMLStreamException { 276 while (reader.hasNext()) { 277 if (reader.isEndElement() && reader.getLocalName().equals(name) 278 && reader.getNamespaceURI().equals(namespace)) { 279 return; 280 } else { 281 // skip all other stuff.... 282 } 283 reader.next(); 284 } 285 } 286 287 private class Foxml11DatastreamInfo implements DatastreamInfo { 288 289 private String id; 290 291 private String controlGroup; 292 293 private String fedoraUri; 294 295 private String state; 296 297 private boolean versionable; 298 299 private ObjectInfo objectInfo; 300 301 public Foxml11DatastreamInfo(final ObjectInfo objectInfo, final XMLStreamReader reader) { 302 this.objectInfo = objectInfo; 303 final Map<String, String> attributes 304 = getAttributes(reader, "ID", "CONTROL_GROUP", "FEDORA_URI", "STATE", "VERSIONABLE"); 305 id = attributes.get("ID"); 306 controlGroup = attributes.get("CONTROL_GROUP"); 307 fedoraUri = attributes.get("FEDORA_URI"); 308 state = attributes.get("STATE"); 309 versionable = Boolean.valueOf(attributes.get("VERSIONABLE")); 310 } 311 312 @Override 313 public ObjectInfo getObjectInfo() { 314 return objectInfo; 315 } 316 317 @Override 318 public String getDatastreamId() { 319 return id; 320 } 321 322 @Override 323 public String getControlGroup() { 324 return controlGroup; 325 } 326 327 @Override 328 public String getFedoraURI() { 329 return fedoraUri; 330 } 331 332 @Override 333 public String getState() { 334 return state; 335 } 336 337 @Override 338 public boolean getVersionable() { 339 return versionable; 340 } 341 } 342 343 public class Foxml11DatastreamVersion implements DatastreamVersion { 344 345 private DatastreamInfo dsInfo; 346 347 private String id; 348 private String label; 349 private String created; 350 private String mimeType; 351 private String altIds; 352 private String formatUri; 353 private long size; 354 private ContentDigest contentDigest; 355 private CachedContent dsContent; 356 private boolean isInlineXml = false; 357 358 /** 359 * foxml datastream version. 360 * @param dsInfo the datastream information 361 * @param reader the reader 362 * @throws XMLStreamException xml stream exception 363 */ 364 public Foxml11DatastreamVersion(final DatastreamInfo dsInfo, 365 final XMLStreamReader reader) throws XMLStreamException { 366 this.dsInfo = dsInfo; 367 final Map<String, String> dsAttributes = getAttributes(reader, "ID", "LABEL", 368 "CREATED", "MIMETYPE", "ALT_IDS", "FORMAT_URI", "SIZE"); 369 id = dsAttributes.get("ID"); 370 label = dsAttributes.get("LABEL"); 371 created = dsAttributes.get("CREATED"); 372 mimeType = dsAttributes.get("MIMETYPE"); 373 altIds = dsAttributes.get("ALT_IDS"); 374 formatUri = dsAttributes.get("FORMAT_URI"); 375 size = dsAttributes.containsKey("SIZE") ? Long.parseLong(dsAttributes.get("SIZE")) : -1; 376 reader.next(); 377 378 while (reader.hasNext()) { 379 if (reader.isCharacters()) { 380 if (!reader.isWhiteSpace()) { 381 throw new RuntimeException("Unexpected character data! \"" + reader.getText() + "\""); 382 } else { 383 // skip whitespace... 384 } 385 } else if (reader.isStartElement()) { 386 final String localName = reader.getLocalName(); 387 if (localName.equals("contentDigest")) { 388 final Map<String, String> attributes = getAttributes(reader, "TYPE", "DIGEST"); 389 this.contentDigest = new DefaultContentDigest(attributes.get("TYPE"), attributes.get("DIGEST")); 390 } else if (localName.equals("xmlContent")) { 391 // this XML fragment may not be valid out of context 392 // context, so write it out as a complete XML 393 // file... 394 reader.next(); 395 396 isInlineXml = true; 397 dsContent = new MemoryCachedContent(extractInlineXml()); 398 } else if (localName.equals("contentLocation")) { 399 final Map<String, String> attributes = getAttributes(reader, "REF", "TYPE"); 400 if (attributes.get("TYPE").equals("INTERNAL_ID")) { 401 dsContent = idResolver.resolveInternalID(attributes.get("REF")); 402 } else { 403 try { 404 String ref = attributes.get("REF"); 405 if (ref.contains("local.fedora.server")) { 406 ref = ref.replace("local.fedora.server", localFedoraServer); 407 } 408 dsContent = new URLCachedContent(new URL(ref), fetcher); 409 } catch (final MalformedURLException e) { 410 throw new RuntimeException(e); 411 } 412 } 413 } else if (localName.equals("binaryContent")) { 414 try { 415 final File f = File.createTempFile("decoded", "file"); 416 tempFiles.add(f); 417 final Base64OutputStream out = new Base64OutputStream( 418 new BufferedOutputStream(new FileOutputStream(f)), false); 419 while (reader.next() == XMLStreamConstants.CHARACTERS) { 420 out.write(reader.getText().getBytes("UTF-8")); 421 } 422 out.flush(); 423 out.close(); 424 dsContent = new FileCachedContent(f); 425 } catch (final IOException e) { 426 throw new RuntimeException(e); 427 } 428 readUntilClosed("binaryContent", FOXML_NS); 429 } else { 430 throw new RuntimeException("Unexpected element! \"" + reader.getLocalName() + "\"!"); 431 } 432 } else if (reader.isEndElement()) { 433 if (reader.getLocalName().equals("datastreamVersion")) { 434 return; 435 } 436 } else { 437 throw new RuntimeException("Unexpected xml structure! \"" + reader.getEventType() + "\" at line " 438 + reader.getLocation().getLineNumber() + ", column " 439 + reader.getLocation().getColumnNumber() 440 + "!" + (reader.isCharacters() ? " \"" + reader.getText() + "\"" : "")); 441 } 442 reader.next(); 443 } 444 445 } 446 447 @Override 448 public Optional<File> getFile() { 449 return dsContent.getFile(); 450 } 451 452 private String extractInlineXml() throws XMLStreamException { 453 final XMLEventReader eventReader = XMLInputFactory2.newFactory().createXMLEventReader(reader); 454 while (eventReader.hasNext()) { 455 final XMLEvent event = eventReader.nextEvent(); 456 if (event.isEndElement() 457 && event.asEndElement().getName().getLocalPart().equals("xmlContent") 458 && event.asEndElement().getName().getNamespaceURI().equals(FOXML_NS)) { 459 break; 460 } 461 } 462 463 return inlineXml.removeFirst(); 464 } 465 466 private void validateInlineXml() { 467 if (isInlineXml && contentDigest != null && StringUtils.isNotBlank(contentDigest.getDigest())) { 468 final var transformedXml = transformInlineXmlForChecksum(); 469 final var digest = DigestUtils.getDigest(contentDigest.getType()); 470 final var digestBytes = DigestUtils.digest(digest, transformedXml); 471 final var digestHex = Hex.encodeHexString(digestBytes); 472 473 if (!digestHex.equalsIgnoreCase(contentDigest.getDigest())) { 474 throw new RuntimeException(String.format( 475 "Inline XML %s %s failed checksum validation. Expected %s: %s; Actual: %s", 476 dsInfo.getObjectInfo().getPid(), dsInfo.getDatastreamId(), 477 contentDigest.getType(), contentDigest.getDigest(), digestHex)); 478 } 479 } 480 } 481 482 /** 483 * This code is based on: https://github.com/fcrepo3/fcrepo-historical/blob/ 484 * e8a3be191cce6bbf8f55cd02bf1d52ac53425146/fcrepo-server/src/main/java/fedora/server/storage/types/ 485 * DatastreamXMLMetadata.java#L92 486 * 487 * This code MUST use these deprecated classes in order to generate the XML attributes in the expected order. 488 * 489 * @return the xml in the format Fedora 3 used to calculate digests 490 */ 491 private byte[] transformInlineXmlForChecksum() { 492 try { 493 // This MUST be done or else Windows will refuse to use the correct encoding!!! :( :( :( 494 final var xml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" 495 + IOUtils.toString(dsContent.getInputStream(), StandardCharsets.UTF_8); 496 497 final var isReader = new InputStreamReader(IOUtils.toInputStream(xml), StandardCharsets.UTF_8); 498 final var source = new InputSource(isReader); 499 source.setEncoding("UTF-8"); 500 501 final Document doc = documentBuilder.parse(source); 502 503 final OutputFormat fmt = new OutputFormat("XML", "UTF-8", false); 504 // indent == 0 means add no indenting 505 fmt.setIndent(0); 506 // default line width is 72, but only applies when indenting 507 fmt.setLineWidth(0); 508 fmt.setPreserveSpace(false); 509 510 final StringWriter out = new StringWriter(); 511 final XMLSerializer ser = new XMLSerializer(out, fmt); 512 ser.serialize(doc); 513 out.close(); 514 515 final var baos = new ByteArrayOutputStream(); 516 final var br = new BufferedReader(new StringReader(out.toString())); 517 String line; 518 final PrintWriter outStream = new PrintWriter(new OutputStreamWriter(baos, StandardCharsets.UTF_8)); 519 while ((line = br.readLine()) != null) { 520 line = line.trim(); 521 outStream.append(line); 522 } 523 outStream.close(); 524 525 return baos.toByteArray(); 526 } catch (IOException e) { 527 throw new UncheckedIOException(e); 528 } catch (SAXException e) { 529 try { 530 LOG.error("Malformed inline XML: {}", IOUtils.toString(dsContent.getInputStream())); 531 } catch (IOException e2) { 532 // swallow 533 } 534 throw new RuntimeException(e); 535 } 536 } 537 538 @Override 539 public DatastreamInfo getDatastreamInfo() { 540 return dsInfo; 541 } 542 543 @Override 544 public String getVersionId() { 545 return id; 546 } 547 548 @Override 549 public String getMimeType() { 550 return mimeType; 551 } 552 553 @Override 554 public String getLabel() { 555 return label; 556 } 557 558 @Override 559 public String getCreated() { 560 return created; 561 } 562 563 @Override 564 public String getAltIds() { 565 return altIds; 566 } 567 568 @Override 569 public String getFormatUri() { 570 return formatUri; 571 } 572 573 @Override 574 public long getSize() { 575 return size; 576 } 577 578 @Override 579 public ContentDigest getContentDigest() { 580 // The digests for inline xml do not match what is stored in the FOXML and should not be returned here. 581 if (isInlineXml) { 582 return null; 583 } 584 return contentDigest; 585 } 586 587 @Override 588 public InputStream getContent() throws IOException { 589 return dsContent.getInputStream(); 590 } 591 592 @Override 593 public String getExternalOrRedirectURL() { 594 if (dsContent instanceof URLCachedContent) { 595 return ((URLCachedContent) dsContent).getURL().toString(); 596 } else { 597 throw new IllegalStateException(); 598 } 599 } 600 601 @Override 602 public boolean isFirstVersionIn(final ObjectReference obj) { 603 final List<DatastreamVersion> datastreams = 604 obj.getDatastreamVersions(getDatastreamInfo().getDatastreamId()); 605 return datastreams.indexOf(this) == 0; 606 } 607 608 @Override 609 public boolean isLastVersionIn(final ObjectReference obj) { 610 final List<DatastreamVersion> datastreams = 611 obj.getDatastreamVersions(getDatastreamInfo().getDatastreamId()); 612 return datastreams.indexOf(this) == datastreams.size() - 1; 613 } 614 } 615 616 private static Map<String, String> getAttributes(final XMLStreamReader r, 617 final String ... allowedNames) { 618 final HashMap<String, String> result = new HashMap<String, String>(); 619 final Set<String> allowed = new HashSet<String>(Arrays.asList(allowedNames)); 620 for (int i = 0; i < r.getAttributeCount(); i ++) { 621 final String localName = r.getAttributeLocalName(i); 622 final String value = r.getAttributeValue(i); 623 if (allowed.contains(localName)) { 624 result.put(localName, value); 625 } else { 626 System.err.println("Unexpected attribute: " + localName + " = \"" + value + "\""); 627 } 628 } 629 return result; 630 631 } 632 633}