1   package nl.dedicon.pipeline.braille.calabash.impl;
2   
3   import com.xmlcalabash.core.XProcException;
4   import com.xmlcalabash.core.XProcRuntime;
5   import com.xmlcalabash.core.XProcStep;
6   import com.xmlcalabash.io.ReadablePipe;
7   import com.xmlcalabash.io.WritablePipe;
8   import com.xmlcalabash.library.DefaultStep;
9   import com.xmlcalabash.runtime.XAtomicStep;
10  import java.io.StringReader;
11  import java.time.LocalDate;
12  import java.time.format.DateTimeFormatter;
13  import java.util.ArrayList;
14  import java.util.Collection;
15  import java.util.Collections;
16  import java.util.List;
17  import java.util.Objects;
18  import java.util.regex.Matcher;
19  import java.util.regex.Pattern;
20  import javax.xml.transform.stream.StreamSource;
21  import static net.sf.saxon.s9api.Axis.CHILD;
22  import net.sf.saxon.s9api.DocumentBuilder;
23  import net.sf.saxon.s9api.QName;
24  import net.sf.saxon.s9api.SaxonApiException;
25  import net.sf.saxon.s9api.XdmItem;
26  import net.sf.saxon.s9api.XdmNode;
27  import net.sf.saxon.s9api.XdmSequenceIterator;
28  import nl.dedicon.pipeline.braille.model.Book;
29  import nl.dedicon.pipeline.braille.model.Page;
30  import nl.dedicon.pipeline.braille.model.Section;
31  import nl.dedicon.pipeline.braille.model.Volume;
32  import org.apache.commons.lang3.StringUtils;
33  import org.daisy.braille.api.embosser.FileFormat;
34  import org.daisy.common.xproc.calabash.XProcStepProvider;
35  import org.daisy.pipeline.braille.common.Provider.util.MemoizingProvider;
36  import static org.daisy.pipeline.braille.common.Provider.util.dispatch;
37  import static org.daisy.pipeline.braille.common.Provider.util.memoize;
38  import org.daisy.pipeline.braille.common.Query;
39  import static org.daisy.pipeline.braille.common.Query.util.mutableQuery;
40  import static org.daisy.pipeline.braille.common.Query.util.query;
41  import org.daisy.pipeline.braille.pef.FileFormatProvider;
42  import org.osgi.service.component.annotations.Component;
43  import org.osgi.service.component.annotations.Reference;
44  import org.osgi.service.component.annotations.ReferenceCardinality;
45  import org.osgi.service.component.annotations.ReferencePolicy;
46  import org.slf4j.Logger;
47  import org.slf4j.LoggerFactory;
48  
49  /**
50   * XProc step for metadata
51   * 
52   * @author Paul Rambags
53   */
54  public class MetadataStep extends DefaultStep {
55  
56      private static final Logger logger = LoggerFactory.getLogger(MetadataStep.class);
57  
58      private static final QName _xquery = new QName("xquery");
59      private static final QName _identifier = new QName("identifier");
60      private static final QName _brf_file_extension = new QName("brf-file-extension");
61      private static final QName _brf_file_format = new QName("brf-file-format");
62      private static final QName _brf_name_pattern = new QName("brf-name-pattern");
63      private static final QName _brf_number_width = new QName("brf-number-width");
64      private static final QName _optional_date = new QName("optional-date");
65  
66      private static final String PEF_NAMESPACE = "http://www.daisy.org/ns/2008/pef";
67      private static final String BRAILLE_DIGITS = "⠚⠁⠃⠉⠙⠑⠋⠛⠓⠊";
68      // whitespace (optional from print number)(optional until page number) whitespace (optional page number)
69      private static final Pattern HEADER = Pattern.compile("[\u2800\\s]+(⠼[" + BRAILLE_DIGITS + "]+|)([\u2800\\s]?⠤[\u2800\\s]?⠼[" + BRAILLE_DIGITS + "]+|)[\u2800\\s]+(⠼[" + BRAILLE_DIGITS + "]+|)");
70      private static final DateTimeFormatter DAY_MONTH_YEAR = DateTimeFormatter.ofPattern("d-M-Y");
71  
72      private final MemoizingProvider<Query,FileFormat> fileFormatProvider;
73  
74      private ReadablePipe source = null;
75      private WritablePipe result = null;
76      
77      private MetadataStep(XProcRuntime runtime, XAtomicStep step, MemoizingProvider<Query,FileFormat> fileFormatProvider) {
78          super(runtime, step);
79          this.fileFormatProvider = fileFormatProvider;
80      }
81  
82      @Override
83      public void setInput(String port, ReadablePipe pipe) {
84          source = pipe;
85      }
86  
87      @Override
88      public void setOutput(String port, WritablePipe pipe) {
89          result = pipe;
90      }
91  
92      @Override
93      public void reset() {
94          source.resetReader();
95          result.resetWriter();
96      }
97  
98      @Override
99      public void run() throws SaxonApiException {
100         super.run();
101 
102         try {
103 
104             XdmNode pef = source.read();
105 
106             /*
107             The XQuery is not used anymore. Instead, we use Java to identify
108             the page numbers from the header (the first line of each page).
109             This is for two reasons:
110             
111             1. easy parsing of headers, incl. print page numbers
112             2. possibility to adjust to unexpected numbering in PEF
113                in duplex mode, each section should start at an odd page
114                but Dotify seems to behave a little strange now and then
115                so we adjust the metadata to what Dotify produces
116             
117             */
118             // String xquery = getOption(_xquery, "");
119             String identifier = getOption(_identifier, "");
120             String brfFileFormat = getOption(_brf_file_format, "");
121             String brfNamePattern = getOption(_brf_name_pattern, "");
122             int brfNumberWidth = getOption(_brf_number_width, 0);
123             String optionalDate = getOption(_optional_date, "");
124 
125             String brfFileExtension = getFileExtension(brfFileFormat);
126 
127             /*
128             InputStream query = new URL(xquery).openConnection().getInputStream();
129             
130             XQueryCompiler xqCompiler = runtime.getProcessor().newXQueryCompiler();
131             XQueryExecutable xqExecutable = xqCompiler.compile(query);
132             XQueryEvaluator xqEvaluator = xqExecutable.load();
133 
134             xqEvaluator.setSource(pef.asSource());
135             xqEvaluator.setExternalVariable(_identifier, new XdmAtomicValue(identifier));
136             xqEvaluator.setExternalVariable(_brf_name_pattern, new XdmAtomicValue(brfNamePattern));
137             xqEvaluator.setExternalVariable(_brf_number_width, new XdmAtomicValue(brfNumberWidth));
138             xqEvaluator.setExternalVariable(_brf_file_extension, new XdmAtomicValue(brfFileExtension));
139             xqEvaluator.setExternalVariable(_optional_date, new XdmAtomicValue(optionalDate));
140 
141             XdmValue xqResult = xqEvaluator.evaluate();
142             
143             // get the first node from the result
144             XdmNode metadata = null;
145             for (XdmValue xqValue : xqResult) {
146                 if (xqValue instanceof XdmNode) {
147                     metadata = (XdmNode)xqValue;
148                     break;
149                 }
150             };
151             */
152             
153             Book book = parsePEF(pef);
154             String metadataXml = createMetadataXml(book, identifier, brfNamePattern, brfNumberWidth, brfFileExtension, optionalDate);
155             DocumentBuilder documentBuilder = runtime.getProcessor().newDocumentBuilder();
156             XdmNode metadata = documentBuilder.build(new StreamSource(new StringReader(metadataXml)));
157 
158             result.write(metadata);
159             
160         } catch (Exception e) {
161 
162             logger.error("dedicon:metadata failed", e);
163             throw new XProcException(step.getNode(), e);
164 
165         }
166     }
167 
168     private String getFileExtension (String fileFormatQuery) {
169         Query.MutableQuery q = mutableQuery(query(fileFormatQuery));
170         Iterable<FileFormat> fileFormats = fileFormatProvider.get(q);
171         String fileExtension = "";
172         for (FileFormat fileFormat : fileFormats) {
173             fileExtension = fileFormat.getFileExtension();
174             break;
175         }
176         return fileExtension;
177     }
178     
179     private Book parsePEF(XdmNode pef) {
180         Book book = new Book();
181         getChildren(pef, PEF_NAMESPACE, "pef").forEach(pefRoot -> {
182             getChildren(pefRoot, PEF_NAMESPACE, "body").forEach(pefBody -> {
183                 getChildren(pefBody, PEF_NAMESPACE, "volume").forEach(pefVolume -> {
184                     Volume volume = new Volume();
185                     book.getVolumes().add(volume);
186                     String pefDuplex = pefVolume.getAttributeValue(new QName("duplex"));
187                     volume.setDuplex("true".equalsIgnoreCase(pefDuplex));
188                     getChildren(pefVolume, PEF_NAMESPACE, "section").forEach(pefSection -> {
189                         Section section = new Section();
190                         volume.getSections().add(section);
191                         getChildren(pefSection, PEF_NAMESPACE, "page").forEach(pefPage -> {
192                             Page page = new Page();
193                             setPageNumbers(page, pefPage);
194                             section.getPages().add(page);
195                         });
196                     });
197                 });
198             });
199         });
200         determineVolumeMetadata(book);
201         return book;
202     }
203     
204     private void setPageNumbers(Page page, XdmNode pefPage) {
205         for(XdmNode pefRow : getChildren(pefPage, PEF_NAMESPACE, "row")) {
206             String header = pefRow.getStringValue();
207             setPageNumbers(page, header);
208             break;
209         }
210     }
211     
212     // this method should be removed in the future because
213     // newer versions of Saxon have a children() method in class XdmNode
214     private List<XdmNode> getChildren(XdmNode parent, String namespace, String child) {
215         List<XdmNode> children = new ArrayList<>();
216         XdmSequenceIterator iterator = parent.axisIterator(CHILD, new QName(namespace, child));
217         while (iterator.hasNext()) {
218             XdmItem item = iterator.next();
219             if (item instanceof XdmNode) {
220                 children.add((XdmNode)item);
221             }
222         }
223         return children;
224     }
225     
226     private void setPageNumbers(Page page, String header) {
227         Matcher pageNumbersMatcher = HEADER.matcher(header);
228         if (pageNumbersMatcher.find()) {
229             String fromPrintPageNumber = afterNumberSign(pageNumbersMatcher.group(1));
230             String untilPrintPageNumber = afterNumberSign(pageNumbersMatcher.group(2));
231             String pageNumber = afterNumberSign(pageNumbersMatcher.group(3));
232             
233             // if there is only one print page number, set the other one
234             if (fromPrintPageNumber.length() > 0 && untilPrintPageNumber.length() == 0) {
235                 untilPrintPageNumber = fromPrintPageNumber;
236             }
237             if (fromPrintPageNumber.length() == 0 && untilPrintPageNumber.length() > 0) {
238                 fromPrintPageNumber = untilPrintPageNumber;
239             }
240             
241             page.setFromPrintPageNumber(getNumber(fromPrintPageNumber));
242             page.setUntilPrintPageNumber(getNumber(untilPrintPageNumber));
243             page.setPageNumber(getNumber(pageNumber));
244         }
245         
246         // no page number found - do nothing
247     }
248     
249     private String afterNumberSign(String brailleNumber) {
250         int index = brailleNumber.indexOf('⠼');
251         if (index >= 0) {
252             return brailleNumber.substring(index + 1);
253         }
254         // no number sign found
255         return "";
256     }
257     
258     private Integer getNumber(String brailleNumber) {
259         if (brailleNumber.length() == 0) {
260             return null;
261         }
262         
263         int number = 0;
264         for (int i = 0; i < brailleNumber.length(); i++) {
265             char brailleDigit = brailleNumber.charAt(i);
266             int digit = BRAILLE_DIGITS.indexOf(brailleDigit);   // digit >= 0
267             number = number*10 + digit;
268         }
269         return number;
270     }
271     
272     private void determineVolumeMetadata(Book book) {
273         Volume previousVolume = null;
274         int expectedFirstPageNumber = 1;
275         for (Volume volume : book.getVolumes()) {
276             volume.setFirstPrintPageNumber(getFirstPrintPageNumber(volume));
277             volume.setLastPrintPageNumber(getLastPrintPageNumber(volume));
278             expectedFirstPageNumber = setPageNumbers(volume, expectedFirstPageNumber);
279             // adjust the last page number of the previous volume
280             if (previousVolume != null) {
281                 previousVolume.setLastPageNumber(volume.getFirstPageNumber() - 1);
282             }
283             previousVolume = volume;
284         }
285     }
286         
287     private Integer getFirstPrintPageNumber(Volume volume) {
288         return volume.getSections().stream()
289                 .map(Section::getPages)
290                 .flatMap(Collection::stream)
291                 .map(Page::getFromPrintPageNumber)
292                 .filter(Objects::nonNull)
293                 .findFirst()
294                 .orElse(null);
295     }
296         
297     // this will actually return the highest print page number, not the last one
298     // which is good, esp. in the case that the last volume ends with a TOC
299     // with out-of-range print page numbers
300     //
301     // another stategy would be to discard print page numbers lower than
302     // the highest one of the previous volume
303     private Integer getLastPrintPageNumber(Volume volume) {
304         return volume.getSections().stream()
305                 .map(Section::getPages)
306                 .flatMap(Collection::stream)
307                 .map(Page::getUntilPrintPageNumber)
308                 .filter(Objects::nonNull)
309                 .sorted(Collections.reverseOrder())
310                 .findFirst()
311                 .orElse(null);
312     }
313 
314     /**
315      * Sets the page numbers and returns the expected first page number of the next volume
316      * 
317      * @param volume
318      * @param expectedFirstPageNumber
319      * @return expected first page number of next volume
320      */
321     private int setPageNumbers(Volume volume, int expectedFirstPageNumber) {
322         volume.setFirstPageNumber(expectedFirstPageNumber);
323         volume.setLastPageNumber(expectedFirstPageNumber);
324         int lastPageNumber = expectedFirstPageNumber;
325         boolean pageNumberFound = false;
326         for (Section section : volume.getSections()) {
327             int pagesInThisSection = 0;
328             for (Page page : section.getPages()) {
329                 if (page.getPageNumber() != null) {
330                     if (!pageNumberFound) {
331                         // the first page number is adjusted
332                         volume.setFirstPageNumber(volume.getFirstPageNumber() + page.getPageNumber() - lastPageNumber);
333                         pageNumberFound = true;
334                     }
335                     lastPageNumber = page.getPageNumber();
336                 }
337                 volume.setLastPageNumber(lastPageNumber);
338                 lastPageNumber ++;
339                 pagesInThisSection ++;
340             }
341             
342             // add one in case of duplex mode and the section has an odd number of pages
343             if (volume.getDuplex()) {
344                 lastPageNumber += pagesInThisSection % 2;
345             }
346         }
347         
348         return lastPageNumber;
349     }
350     
351     private String createMetadataXml(Book book, String identifier, String brfNamePattern, int brfNumberWidth, String brfFileExtension, String optionalDate) throws SaxonApiException {
352         String date = optionalDate;
353         if (StringUtils.isBlank(date)) {
354             date = LocalDate.now().format(DAY_MONTH_YEAR);
355         }
356         StringBuilder xml = new StringBuilder();
357         xml.append("<lois_id>").append(identifier).append("</lois_id>");
358         int volumeIndex = 0;
359         for (Volume volume : book.getVolumes()) {
360             volumeIndex ++;
361             xml.append("<volume>");
362             xml.append("<filename>").append(getFilename(volumeIndex, brfNamePattern, brfNumberWidth, brfFileExtension)).append("</filename>");
363             xml.append("<vtype>br</vtype>");
364             xml.append("<volumenumber>").append(volumeIndex).append("</volumenumber>");
365             xml.append("<fromip>");
366             if (volume.getFirstPrintPageNumber() != null) {
367                 xml.append(volume.getFirstPrintPageNumber());
368             }
369             xml.append("</fromip>");
370             xml.append("<tillip>");
371             if (volume.getLastPrintPageNumber() != null) {
372                 xml.append(volume.getLastPrintPageNumber());
373             }
374             xml.append("</tillip>");
375             xml.append("<ippages>");
376             if (volume.getFirstPrintPageNumber() != null && volume.getLastPrintPageNumber() != null) {
377                 xml.append(volume.getLastPrintPageNumber() - volume.getFirstPrintPageNumber() + 1);
378             }
379             xml.append("</ippages>");
380             xml.append("<fromcp>");
381             if (volume.getFirstPageNumber() != null) {
382                 xml.append(volume.getFirstPageNumber());
383             }
384             xml.append("</fromcp>");
385             xml.append("<tillcp>");
386             if (volume.getLastPageNumber() != null) {
387                 xml.append(volume.getLastPageNumber());
388             }
389             xml.append("</tillcp>");
390             xml.append("<amount>");
391             if (volume.getFirstPageNumber() != null && volume.getLastPageNumber() != null) {
392                 xml.append(volume.getLastPageNumber() - volume.getFirstPageNumber() + 1);
393             }
394             xml.append("</amount>");
395             xml.append("<last>").append(volumeIndex == book.getVolumes().size() ? "Y" : "N").append("</last>");
396             xml.append("<vreadydate>").append(date).append("</vreadydate>");
397             xml.append("</volume>");          
398         }
399         String document = "<document>".concat(xml.toString()).concat("</document>");
400         return document;
401     }
402 
403     private String getFilename(int volumeIndex, String brfNamePattern, int brfNumberWidth, String brfFileExtension) {
404         String brfNumber = String.valueOf(volumeIndex);
405         while (brfNumber.length() < brfNumberWidth) {
406             brfNumber = "0" + brfNumber;
407         }
408         return brfNamePattern.replace("{}", brfNumber) + brfFileExtension;
409     }
410     
411     @Component(
412             name = "dedicon:metadata",
413             service = {XProcStepProvider.class},
414             property = {"type:String={http://www.dedicon.nl}metadata"}
415     )
416     public static class Provider implements XProcStepProvider {
417 
418         private List<FileFormatProvider> fileFormatProviders = new ArrayList<>();
419         private MemoizingProvider<Query,FileFormat> fileFormatProvider = memoize(dispatch(fileFormatProviders));
420 
421         @Override
422         public XProcStep newStep(XProcRuntime runtime, XAtomicStep step) {
423             return new MetadataStep(runtime, step, fileFormatProvider);
424         }
425 
426         @Reference(
427                 name = "FileFormatProvider",
428                 unbind = "unbindFileFormatProvider",
429                 service = FileFormatProvider.class,
430                 cardinality = ReferenceCardinality.MULTIPLE,
431                 policy = ReferencePolicy.DYNAMIC
432         )
433         protected void bindFileFormatProvider(FileFormatProvider provider) {
434                 fileFormatProviders.add(provider);
435         }
436 
437         protected void unbindFileFormatProvider(FileFormatProvider provider) {
438                 fileFormatProviders.remove(provider);
439                 this.fileFormatProvider.invalidateCache();
440         }
441     }
442 }