001/* 002 * Copyright 2015 DuraSpace, Inc. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016package org.fcrepo.migration.foxml; 017 018import io.micrometer.core.instrument.Metrics; 019import io.micrometer.core.instrument.Timer; 020import org.apache.commons.codec.binary.Base64OutputStream; 021import org.apache.commons.codec.binary.Hex; 022import org.apache.commons.codec.digest.DigestUtils; 023import org.apache.commons.io.FileUtils; 024import org.apache.commons.io.IOUtils; 025import org.apache.commons.lang3.StringUtils; 026import org.apache.xml.serialize.OutputFormat; 027import org.apache.xml.serialize.XMLSerializer; 028import org.codehaus.stax2.XMLInputFactory2; 029import org.fcrepo.migration.ContentDigest; 030import org.fcrepo.migration.DatastreamInfo; 031import org.fcrepo.migration.DatastreamVersion; 032import org.fcrepo.migration.DefaultContentDigest; 033import org.fcrepo.migration.DefaultObjectInfo; 034import org.fcrepo.migration.FedoraObjectProcessor; 035import org.fcrepo.migration.ObjectInfo; 036import org.fcrepo.migration.ObjectProperties; 037import org.fcrepo.migration.ObjectReference; 038import org.fcrepo.migration.StreamingFedoraObjectHandler; 039import org.slf4j.Logger; 040import org.slf4j.LoggerFactory; 041import org.w3c.dom.Document; 042import org.xml.sax.InputSource; 043import org.xml.sax.SAXException; 044 045import javax.xml.bind.JAXBContext; 046import javax.xml.bind.JAXBElement; 047import javax.xml.bind.JAXBException; 048import javax.xml.bind.Unmarshaller; 049import javax.xml.parsers.DocumentBuilder; 050import javax.xml.parsers.DocumentBuilderFactory; 051import javax.xml.parsers.ParserConfigurationException; 052import javax.xml.stream.XMLEventReader; 053import javax.xml.stream.XMLInputFactory; 054import javax.xml.stream.XMLStreamConstants; 055import javax.xml.stream.XMLStreamException; 056import javax.xml.stream.XMLStreamReader; 057import javax.xml.stream.events.XMLEvent; 058import java.io.BufferedInputStream; 059import java.io.BufferedReader; 060import java.io.ByteArrayOutputStream; 061import java.io.File; 062import java.io.FileInputStream; 063import java.io.FileNotFoundException; 064import java.io.FileOutputStream; 065import java.io.IOException; 066import java.io.InputStream; 067import java.io.InputStreamReader; 068import java.io.OutputStreamWriter; 069import java.io.PrintWriter; 070import java.io.StringReader; 071import java.io.StringWriter; 072import java.io.UncheckedIOException; 073import java.net.MalformedURLException; 074import java.net.URL; 075import java.nio.charset.StandardCharsets; 076import java.util.ArrayList; 077import java.util.Arrays; 078import java.util.HashMap; 079import java.util.HashSet; 080import java.util.LinkedList; 081import java.util.List; 082import java.util.Map; 083import java.util.Optional; 084import java.util.Set; 085import java.util.regex.Pattern; 086 087/** 088 * A FedoraObjectProcessor implementation that uses the STaX API to process 089 * a FOXML XML InputStream. 090 * @author mdurbin 091 */ 092public class FoxmlInputStreamFedoraObjectProcessor implements FedoraObjectProcessor { 093 094 private static final Logger LOG = LoggerFactory.getLogger(FoxmlInputStreamFedoraObjectProcessor.class); 095 096 private static final Pattern INLINE_PATTERN = Pattern.compile("<foxml:xmlContent>(.*?)</foxml:xmlContent>", 097 Pattern.DOTALL); 098 099 private static final String FOXML_NS = "info:fedora/fedora-system:def/foxml#"; 100 101 private static final String METRIC_NAME = "fcrepo.storage.foxml.object"; 102 private static final String OPERATION = "operation"; 103 private static final Timer processObjectTimer = Metrics.timer(METRIC_NAME, OPERATION, "processObject"); 104 private static final Timer completeObjectTimer = Metrics.timer(METRIC_NAME, OPERATION, "completeObject"); 105 106 private URLFetcher fetcher; 107 108 private String localFedoraServer; 109 110 private InternalIDResolver idResolver; 111 112 private File file; 113 114 private InputStream stream; 115 116 private XMLStreamReader reader; 117 118 private DocumentBuilder documentBuilder; 119 120 private List<File> tempFiles; 121 122 private LinkedList<String> inlineXml; 123 124 /** 125 * The basic object information read from the XML stream at construction 126 * time by processing the root XML element and its attributes. 127 */ 128 private ObjectInfo objectInfo; 129 130 /** 131 * foxml input stream fedora object processor. 132 * @param file the FOXML file 133 * @param fetcher the fetcher 134 * @param resolver the resolver 135 * @param localFedoraServer the host and port (formatted like "localhost:8080") of the fedora 3 server 136 * from which the content exposed by the "is" parameter comes. 137 * @throws XMLStreamException xml stream exception 138 */ 139 public FoxmlInputStreamFedoraObjectProcessor(final File file, final URLFetcher fetcher, 140 final InternalIDResolver resolver, final String localFedoraServer) 141 throws XMLStreamException, FileNotFoundException { 142 this.file = file; 143 this.fetcher = fetcher; 144 this.idResolver = resolver; 145 this.localFedoraServer = localFedoraServer; 146 final XMLInputFactory factory = XMLInputFactory.newFactory(); 147 stream = new BufferedInputStream(new FileInputStream(file)); 148 reader = factory.createXMLStreamReader(stream); 149 reader.nextTag(); 150 final Map<String, String> attributes = getAttributes(reader, "PID", "VERSION", "FEDORA_URI", "schemaLocation"); 151 objectInfo = new DefaultObjectInfo(attributes.get("PID"), attributes.get("FEDORA_URI"), this.file.toPath()); 152 while (reader.next() == XMLStreamConstants.CHARACTERS) { 153 } 154 155 tempFiles = new ArrayList<File>(); 156 157 final var builderFactory = DocumentBuilderFactory.newInstance(); 158 builderFactory.setNamespaceAware(true); 159 builderFactory.setIgnoringComments(false); 160 try { 161 documentBuilder = builderFactory.newDocumentBuilder(); 162 } catch (ParserConfigurationException e) { 163 throw new RuntimeException(e); 164 } 165 166 try { 167 inlineXml = new LinkedList<>(); 168 final var content = FileUtils.readFileToString(file); 169 final var matcher = INLINE_PATTERN.matcher(content); 170 while (matcher.find()) { 171 inlineXml.add(matcher.group(1)); 172 } 173 } catch (IOException e) { 174 throw new UncheckedIOException(e); 175 } 176 } 177 178 @Override 179 public ObjectInfo getObjectInfo() { 180 return objectInfo; 181 } 182 183 @Override 184 public void processObject(final StreamingFedoraObjectHandler handler) { 185 final var stopwatch = Timer.start(); 186 handler.beginObject(objectInfo); 187 Foxml11DatastreamInfo dsInfo = null; 188 try { 189 handler.processObjectProperties(readProperties()); 190 while (reader.hasNext()) { 191 if (reader.isCharacters()) { 192 if (!reader.isWhiteSpace()) { 193 throw new RuntimeException("Unexpected character data! \"" + reader.getText() + "\""); 194 } else { 195 // skip whitespace... 196 } 197 } else if (reader.isStartElement()) { 198 if (reader.getLocalName().equals("datastream") 199 && reader.getNamespaceURI().equals(FOXML_NS)) { 200 dsInfo = new Foxml11DatastreamInfo(objectInfo, reader); 201 } else if (reader.getLocalName().equals("datastreamVersion")) { 202 final var v = new Foxml11DatastreamVersion(dsInfo, reader); 203 v.validateInlineXml(); 204 handler.processDatastreamVersion(v); 205 } else { 206 throw new RuntimeException("Unexpected element! \"" + reader.getLocalName() + "\"!"); 207 } 208 } else if (reader.isEndElement() && (dsInfo != null && reader.getLocalName().equals("datastream"))) { 209 dsInfo = null; 210 } else if (reader.isEndElement() && reader.getLocalName().equals("digitalObject")) { 211 // end of document.... 212 } else { 213 throw new RuntimeException("Unexpected xml structure! \"" + reader.getEventType() + "\" at line " 214 + reader.getLocation().getLineNumber() + ", column " 215 + reader.getLocation().getColumnNumber() 216 + "!" + (reader.isCharacters() ? " \"" + reader.getText() + "\"" : "")); 217 } 218 reader.next(); 219 } 220 } catch (Exception e) { 221 abort(handler, e); 222 } finally { 223 stopwatch.stop(processObjectTimer); 224 } 225 226 completeObjectTimer.record(() -> complete(handler)); 227 } 228 229 private void complete(final StreamingFedoraObjectHandler handler) { 230 try { 231 handler.completeObject(objectInfo); 232 cleanUpTempFiles(); 233 } catch (Exception e) { 234 abort(handler, e); 235 } 236 } 237 238 private void abort(final StreamingFedoraObjectHandler handler, final Exception e) { 239 try { 240 handler.abortObject(objectInfo); 241 if (e instanceof RuntimeException) { 242 throw (RuntimeException) e; 243 } 244 throw new RuntimeException(e); 245 } finally { 246 cleanUpTempFiles(); 247 close(); 248 } 249 } 250 251 /** 252 * Close resources associated to the processor 253 */ 254 public void close() { 255 try { 256 reader.close(); 257 } catch (final XMLStreamException e) { 258 LOG.warn("Failed to close reader cleanly", e); 259 } 260 try { 261 stream.close(); 262 } catch (IOException e) { 263 LOG.warn("Failed to close file cleanly", e); 264 } 265 } 266 267 private void cleanUpTempFiles() { 268 for (final File f : this.tempFiles) { 269 if (f.exists()) { 270 f.delete(); 271 } 272 } 273 } 274 275 private ObjectProperties readProperties() throws JAXBException, XMLStreamException { 276 final JAXBContext jc = JAXBContext.newInstance(FoxmlObjectProperties.class); 277 final Unmarshaller unmarshaller = jc.createUnmarshaller(); 278 final JAXBElement<FoxmlObjectProperties> p = unmarshaller.unmarshal(reader, FoxmlObjectProperties.class); 279 final FoxmlObjectProperties properties = p.getValue(); 280 return properties; 281 } 282 283 private void readUntilClosed(final String name, final String namespace) throws XMLStreamException { 284 while (reader.hasNext()) { 285 if (reader.isEndElement() && reader.getLocalName().equals(name) 286 && reader.getNamespaceURI().equals(namespace)) { 287 return; 288 } else { 289 // skip all other stuff.... 290 } 291 reader.next(); 292 } 293 } 294 295 private class Foxml11DatastreamInfo implements DatastreamInfo { 296 297 private String id; 298 299 private String controlGroup; 300 301 private String fedoraUri; 302 303 private String state; 304 305 private boolean versionable; 306 307 private ObjectInfo objectInfo; 308 309 public Foxml11DatastreamInfo(final ObjectInfo objectInfo, final XMLStreamReader reader) { 310 this.objectInfo = objectInfo; 311 final Map<String, String> attributes 312 = getAttributes(reader, "ID", "CONTROL_GROUP", "FEDORA_URI", "STATE", "VERSIONABLE"); 313 id = attributes.get("ID"); 314 controlGroup = attributes.get("CONTROL_GROUP"); 315 fedoraUri = attributes.get("FEDORA_URI"); 316 state = attributes.get("STATE"); 317 versionable = Boolean.valueOf(attributes.get("VERSIONABLE")); 318 } 319 320 @Override 321 public ObjectInfo getObjectInfo() { 322 return objectInfo; 323 } 324 325 @Override 326 public String getDatastreamId() { 327 return id; 328 } 329 330 @Override 331 public String getControlGroup() { 332 return controlGroup; 333 } 334 335 @Override 336 public String getFedoraURI() { 337 return fedoraUri; 338 } 339 340 @Override 341 public String getState() { 342 return state; 343 } 344 345 @Override 346 public boolean getVersionable() { 347 return versionable; 348 } 349 } 350 351 public class Foxml11DatastreamVersion implements DatastreamVersion { 352 353 private DatastreamInfo dsInfo; 354 355 private String id; 356 private String label; 357 private String created; 358 private String mimeType; 359 private String altIds; 360 private String formatUri; 361 private long size; 362 private ContentDigest contentDigest; 363 private CachedContent dsContent; 364 private boolean isInlineXml = false; 365 366 /** 367 * foxml datastream version. 368 * @param dsInfo the datastream information 369 * @param reader the reader 370 * @throws XMLStreamException xml stream exception 371 */ 372 public Foxml11DatastreamVersion(final DatastreamInfo dsInfo, 373 final XMLStreamReader reader) throws XMLStreamException { 374 this.dsInfo = dsInfo; 375 final Map<String, String> dsAttributes = getAttributes(reader, "ID", "LABEL", 376 "CREATED", "MIMETYPE", "ALT_IDS", "FORMAT_URI", "SIZE"); 377 id = dsAttributes.get("ID"); 378 label = dsAttributes.get("LABEL"); 379 created = dsAttributes.get("CREATED"); 380 mimeType = dsAttributes.get("MIMETYPE"); 381 altIds = dsAttributes.get("ALT_IDS"); 382 formatUri = dsAttributes.get("FORMAT_URI"); 383 size = dsAttributes.containsKey("SIZE") ? Long.parseLong(dsAttributes.get("SIZE")) : -1; 384 reader.next(); 385 386 while (reader.hasNext()) { 387 if (reader.isCharacters()) { 388 if (!reader.isWhiteSpace()) { 389 throw new RuntimeException("Unexpected character data! \"" + reader.getText() + "\""); 390 } else { 391 // skip whitespace... 392 } 393 } else if (reader.isStartElement()) { 394 final String localName = reader.getLocalName(); 395 if (localName.equals("contentDigest")) { 396 final Map<String, String> attributes = getAttributes(reader, "TYPE", "DIGEST"); 397 this.contentDigest = new DefaultContentDigest(attributes.get("TYPE"), attributes.get("DIGEST")); 398 } else if (localName.equals("xmlContent")) { 399 // this XML fragment may not be valid out of context 400 // context, so write it out as a complete XML 401 // file... 402 reader.next(); 403 404 isInlineXml = true; 405 dsContent = new MemoryCachedContent(extractInlineXml()); 406 } else if (localName.equals("contentLocation")) { 407 final Map<String, String> attributes = getAttributes(reader, "REF", "TYPE"); 408 if (attributes.get("TYPE").equals("INTERNAL_ID")) { 409 dsContent = idResolver.resolveInternalID(attributes.get("REF")); 410 } else { 411 try { 412 String ref = attributes.get("REF"); 413 if (ref.contains("local.fedora.server")) { 414 ref = ref.replace("local.fedora.server", localFedoraServer); 415 } 416 dsContent = new URLCachedContent(new URL(ref), fetcher); 417 } catch (final MalformedURLException e) { 418 throw new RuntimeException(e); 419 } 420 } 421 } else if (localName.equals("binaryContent")) { 422 try { 423 final File f = File.createTempFile("decoded", "file"); 424 tempFiles.add(f); 425 final Base64OutputStream out = new Base64OutputStream(new FileOutputStream(f), false); 426 while (reader.next() == XMLStreamConstants.CHARACTERS) { 427 out.write(reader.getText().getBytes("UTF-8")); 428 } 429 out.flush(); 430 out.close(); 431 dsContent = new FileCachedContent(f); 432 } catch (final IOException e) { 433 throw new RuntimeException(e); 434 } 435 readUntilClosed("binaryContent", FOXML_NS); 436 } else { 437 throw new RuntimeException("Unexpected element! \"" + reader.getLocalName() + "\"!"); 438 } 439 } else if (reader.isEndElement()) { 440 if (reader.getLocalName().equals("datastreamVersion")) { 441 return; 442 } 443 } else { 444 throw new RuntimeException("Unexpected xml structure! \"" + reader.getEventType() + "\" at line " 445 + reader.getLocation().getLineNumber() + ", column " 446 + reader.getLocation().getColumnNumber() 447 + "!" + (reader.isCharacters() ? " \"" + reader.getText() + "\"" : "")); 448 } 449 reader.next(); 450 } 451 452 } 453 454 @Override 455 public Optional<File> getFile() { 456 return dsContent.getFile(); 457 } 458 459 private String extractInlineXml() throws XMLStreamException { 460 final XMLEventReader eventReader = XMLInputFactory2.newFactory().createXMLEventReader(reader); 461 while (eventReader.hasNext()) { 462 final XMLEvent event = eventReader.nextEvent(); 463 if (event.isEndElement() 464 && event.asEndElement().getName().getLocalPart().equals("xmlContent") 465 && event.asEndElement().getName().getNamespaceURI().equals(FOXML_NS)) { 466 break; 467 } 468 } 469 470 return inlineXml.removeFirst(); 471 } 472 473 private void validateInlineXml() { 474 if (isInlineXml && contentDigest != null && StringUtils.isNotBlank(contentDigest.getDigest())) { 475 final var transformedXml = transformInlineXmlForChecksum(); 476 final var digest = DigestUtils.getDigest(contentDigest.getType()); 477 final var digestBytes = DigestUtils.digest(digest, transformedXml); 478 final var digestHex = Hex.encodeHexString(digestBytes); 479 480 if (!digestHex.equalsIgnoreCase(contentDigest.getDigest())) { 481 throw new RuntimeException(String.format( 482 "Inline XML %s %s failed checksum validation. Expected %s: %s; Actual: %s", 483 dsInfo.getObjectInfo().getPid(), dsInfo.getDatastreamId(), 484 contentDigest.getType(), contentDigest.getDigest(), digestHex)); 485 } 486 } 487 } 488 489 /** 490 * This code is based on: https://github.com/fcrepo3/fcrepo-historical/blob/ 491 * e8a3be191cce6bbf8f55cd02bf1d52ac53425146/fcrepo-server/src/main/java/fedora/server/storage/types/ 492 * DatastreamXMLMetadata.java#L92 493 * 494 * This code MUST use these deprecated classes in order to generate the XML attributes in the expected order. 495 * 496 * @return the xml in the format Fedora 3 used to calculate digests 497 */ 498 private byte[] transformInlineXmlForChecksum() { 499 try { 500 // This MUST be done or else Windows will refuse to use the correct encoding!!! :( :( :( 501 final var xml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" 502 + IOUtils.toString(dsContent.getInputStream(), StandardCharsets.UTF_8); 503 504 final var isReader = new InputStreamReader(IOUtils.toInputStream(xml), StandardCharsets.UTF_8); 505 final var source = new InputSource(isReader); 506 source.setEncoding("UTF-8"); 507 508 final Document doc = documentBuilder.parse(source); 509 510 final OutputFormat fmt = new OutputFormat("XML", "UTF-8", false); 511 // indent == 0 means add no indenting 512 fmt.setIndent(0); 513 // default line width is 72, but only applies when indenting 514 fmt.setLineWidth(0); 515 fmt.setPreserveSpace(false); 516 517 final StringWriter out = new StringWriter(); 518 final XMLSerializer ser = new XMLSerializer(out, fmt); 519 ser.serialize(doc); 520 out.close(); 521 522 final var baos = new ByteArrayOutputStream(); 523 final var br = new BufferedReader(new StringReader(out.toString())); 524 String line; 525 final PrintWriter outStream = new PrintWriter(new OutputStreamWriter(baos, StandardCharsets.UTF_8)); 526 while ((line = br.readLine()) != null) { 527 line = line.trim(); 528 outStream.append(line); 529 } 530 outStream.close(); 531 532 return baos.toByteArray(); 533 } catch (IOException e) { 534 throw new UncheckedIOException(e); 535 } catch (SAXException e) { 536 try { 537 LOG.error("Malformed inline XML: {}", IOUtils.toString(dsContent.getInputStream())); 538 } catch (IOException e2) { 539 // swallow 540 } 541 throw new RuntimeException(e); 542 } 543 } 544 545 @Override 546 public DatastreamInfo getDatastreamInfo() { 547 return dsInfo; 548 } 549 550 @Override 551 public String getVersionId() { 552 return id; 553 } 554 555 @Override 556 public String getMimeType() { 557 return mimeType; 558 } 559 560 @Override 561 public String getLabel() { 562 return label; 563 } 564 565 @Override 566 public String getCreated() { 567 return created; 568 } 569 570 @Override 571 public String getAltIds() { 572 return altIds; 573 } 574 575 @Override 576 public String getFormatUri() { 577 return formatUri; 578 } 579 580 @Override 581 public long getSize() { 582 return size; 583 } 584 585 @Override 586 public ContentDigest getContentDigest() { 587 // The digests for inline xml do not match what is stored in the FOXML and should not be returned here. 588 if (isInlineXml) { 589 return null; 590 } 591 return contentDigest; 592 } 593 594 @Override 595 public InputStream getContent() throws IOException { 596 return dsContent.getInputStream(); 597 } 598 599 @Override 600 public String getExternalOrRedirectURL() { 601 if (dsContent instanceof URLCachedContent) { 602 return ((URLCachedContent) dsContent).getURL().toString(); 603 } else { 604 throw new IllegalStateException(); 605 } 606 } 607 608 @Override 609 public boolean isFirstVersionIn(final ObjectReference obj) { 610 final List<DatastreamVersion> datastreams = 611 obj.getDatastreamVersions(getDatastreamInfo().getDatastreamId()); 612 return datastreams.indexOf(this) == 0; 613 } 614 615 @Override 616 public boolean isLastVersionIn(final ObjectReference obj) { 617 final List<DatastreamVersion> datastreams = 618 obj.getDatastreamVersions(getDatastreamInfo().getDatastreamId()); 619 return datastreams.indexOf(this) == datastreams.size() - 1; 620 } 621 } 622 623 private static Map<String, String> getAttributes(final XMLStreamReader r, 624 final String ... allowedNames) { 625 final HashMap<String, String> result = new HashMap<String, String>(); 626 final Set<String> allowed = new HashSet<String>(Arrays.asList(allowedNames)); 627 for (int i = 0; i < r.getAttributeCount(); i ++) { 628 final String localName = r.getAttributeLocalName(i); 629 final String value = r.getAttributeValue(i); 630 if (allowed.contains(localName)) { 631 result.put(localName, value); 632 } else { 633 System.err.println("Unexpected attribute: " + localName + " = \"" + value + "\""); 634 } 635 } 636 return result; 637 638 } 639 640}