1   package nl.dedicon.pipeline.braille.calabash.impl;
2   
import com.xmlcalabash.core.XProcException;
import com.xmlcalabash.core.XProcRuntime;
import com.xmlcalabash.core.XProcStep;
import com.xmlcalabash.io.ReadablePipe;
import com.xmlcalabash.io.WritablePipe;
import com.xmlcalabash.library.DefaultStep;
import com.xmlcalabash.runtime.XAtomicStep;
import java.io.StringReader;
import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Objects;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.transform.stream.StreamSource;
import static net.sf.saxon.s9api.Axis.CHILD;
import net.sf.saxon.s9api.DocumentBuilder;
import net.sf.saxon.s9api.QName;
import net.sf.saxon.s9api.SaxonApiException;
import net.sf.saxon.s9api.XdmItem;
import net.sf.saxon.s9api.XdmNode;
import net.sf.saxon.s9api.XdmSequenceIterator;
import nl.dedicon.pipeline.braille.model.Book;
import nl.dedicon.pipeline.braille.model.Page;
import nl.dedicon.pipeline.braille.model.Section;
import nl.dedicon.pipeline.braille.model.Volume;
import org.daisy.braille.api.embosser.FileFormat;
import org.daisy.common.xproc.calabash.XProcStepProvider;
import org.daisy.pipeline.braille.common.Provider.util.MemoizingProvider;
import static org.daisy.pipeline.braille.common.Provider.util.dispatch;
import static org.daisy.pipeline.braille.common.Provider.util.memoize;
import org.daisy.pipeline.braille.common.Query;
import static org.daisy.pipeline.braille.common.Query.util.mutableQuery;
import static org.daisy.pipeline.braille.common.Query.util.query;
import org.daisy.pipeline.braille.pef.FileFormatProvider;
import org.osgi.service.component.annotations.Component;
import org.osgi.service.component.annotations.Reference;
import org.osgi.service.component.annotations.ReferenceCardinality;
import org.osgi.service.component.annotations.ReferencePolicy;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
47  
48  /**
49   * XProc step for metadata
50   * 
51   * @author Paul Rambags
52   */
53  public class MetadataStep extends DefaultStep {
54  
55      private static final Logger logger = LoggerFactory.getLogger(MetadataStep.class);
56  
57      private static final QName _xquery = new QName("xquery");
58      private static final QName _identifier = new QName("identifier");
59      private static final QName _brf_file_extension = new QName("brf-file-extension");
60      private static final QName _brf_file_format = new QName("brf-file-format");
61      private static final QName _brf_name_pattern = new QName("brf-name-pattern");
62      private static final QName _brf_number_width = new QName("brf-number-width");
63      private static final QName _optional_date = new QName("optional-date");
64  
65      private static final String PEF_NAMESPACE = "http://www.daisy.org/ns/2008/pef";
66      private static final String BRAILLE_DIGITS = "⠚⠁⠃⠉⠙⠑⠋⠛⠓⠊";
67      // whitespace (optional from print number)(optional until page number) whitespace (optional page number)
68      private static final Pattern HEADER = Pattern.compile("[\u2800\\s]+(⠼[" + BRAILLE_DIGITS + "]+|)([\u2800\\s]?⠤[\u2800\\s]?⠼[" + BRAILLE_DIGITS + "]+|)[\u2800\\s]+(⠼[" + BRAILLE_DIGITS + "]+|)");
69      private static final DateTimeFormatter DAY_MONTH_YEAR = DateTimeFormatter.ofPattern("d-M-Y");
70  
71      private final MemoizingProvider<Query,FileFormat> fileFormatProvider;
72  
73      private ReadablePipe source = null;
74      private WritablePipe result = null;
75      
76      private MetadataStep(XProcRuntime runtime, XAtomicStep step, MemoizingProvider<Query,FileFormat> fileFormatProvider) {
77          super(runtime, step);
78          this.fileFormatProvider = fileFormatProvider;
79      }
80  
81      @Override
82      public void setInput(String port, ReadablePipe pipe) {
83          source = pipe;
84      }
85  
86      @Override
87      public void setOutput(String port, WritablePipe pipe) {
88          result = pipe;
89      }
90  
91      @Override
92      public void reset() {
93          source.resetReader();
94          result.resetWriter();
95      }
96  
97      @Override
98      public void run() throws SaxonApiException {
99          super.run();
100 
101         try {
102 
103             XdmNode pef = source.read();
104 
105             /*
106             The XQuery is not used anymore. Instead, we use Java to identify
107             the page numbers from the header (the first line of each page).
108             This is for two reasons:
109             
110             1. easy parsing of headers, incl. print page numbers
111             2. possibility to adjust to unexpected numbering in PEF
112                in duplex mode, each section should start at an odd page
113                but Dotify seems to behave a little strange now and then
114                so we adjust the metadata to what Dotify produces
115             
116             */
117             // String xquery = getOption(_xquery, "");
118             String identifier = getOption(_identifier, "");
119             String brfFileFormat = getOption(_brf_file_format, "");
120             String brfNamePattern = getOption(_brf_name_pattern, "");
121             int brfNumberWidth = getOption(_brf_number_width, 0);
122             String optionalDate = getOption(_optional_date, "");
123 
124             String brfFileExtension = getFileExtension(brfFileFormat);
125 
126             /*
127             InputStream query = new URL(xquery).openConnection().getInputStream();
128             
129             XQueryCompiler xqCompiler = runtime.getProcessor().newXQueryCompiler();
130             XQueryExecutable xqExecutable = xqCompiler.compile(query);
131             XQueryEvaluator xqEvaluator = xqExecutable.load();
132 
133             xqEvaluator.setSource(pef.asSource());
134             xqEvaluator.setExternalVariable(_identifier, new XdmAtomicValue(identifier));
135             xqEvaluator.setExternalVariable(_brf_name_pattern, new XdmAtomicValue(brfNamePattern));
136             xqEvaluator.setExternalVariable(_brf_number_width, new XdmAtomicValue(brfNumberWidth));
137             xqEvaluator.setExternalVariable(_brf_file_extension, new XdmAtomicValue(brfFileExtension));
138             xqEvaluator.setExternalVariable(_optional_date, new XdmAtomicValue(optionalDate));
139 
140             XdmValue xqResult = xqEvaluator.evaluate();
141             
142             // get the first node from the result
143             XdmNode metadata = null;
144             for (XdmValue xqValue : xqResult) {
145                 if (xqValue instanceof XdmNode) {
146                     metadata = (XdmNode)xqValue;
147                     break;
148                 }
149             };
150             */
151             
152             Book book = parsePEF(pef);
153             String metadataXml = createMetadataXml(book, identifier, brfNamePattern, brfNumberWidth, brfFileExtension, optionalDate);
154             DocumentBuilder documentBuilder = runtime.getProcessor().newDocumentBuilder();
155             XdmNode metadata = documentBuilder.build(new StreamSource(new StringReader(metadataXml)));
156 
157             result.write(metadata);
158             
159         } catch (Exception e) {
160 
161             logger.error("dedicon:metadata failed", e);
162             throw new XProcException(step.getNode(), e);
163 
164         }
165     }
166 
167     private String getFileExtension (String fileFormatQuery) {
168         Query.MutableQuery q = mutableQuery(query(fileFormatQuery));
169         Iterable<FileFormat> fileFormats = fileFormatProvider.get(q);
170         String fileExtension = "";
171         for (FileFormat fileFormat : fileFormats) {
172             fileExtension = fileFormat.getFileExtension();
173             break;
174         }
175         return fileExtension;
176     }
177     
178     private Book parsePEF(XdmNode pef) {
179         Book book = new Book();
180         getChildren(pef, PEF_NAMESPACE, "pef").forEach(pefRoot -> {
181             getChildren(pefRoot, PEF_NAMESPACE, "body").forEach(pefBody -> {
182                 getChildren(pefBody, PEF_NAMESPACE, "volume").forEach(pefVolume -> {
183                     Volume volume = new Volume();
184                     book.getVolumes().add(volume);
185                     String pefDuplex = pefVolume.getAttributeValue(new QName("duplex"));
186                     volume.setDuplex("true".equalsIgnoreCase(pefDuplex));
187                     getChildren(pefVolume, PEF_NAMESPACE, "section").forEach(pefSection -> {
188                         Section section = new Section();
189                         volume.getSections().add(section);
190                         getChildren(pefSection, PEF_NAMESPACE, "page").forEach(pefPage -> {
191                             Page page = new Page();
192                             setPageNumbers(page, pefPage);
193                             section.getPages().add(page);
194                         });
195                     });
196                 });
197             });
198         });
199         determineVolumeMetadata(book);
200         return book;
201     }
202     
203     private void setPageNumbers(Page page, XdmNode pefPage) {
204         for(XdmNode pefRow : getChildren(pefPage, PEF_NAMESPACE, "row")) {
205             String header = pefRow.getStringValue();
206             setPageNumbers(page, header);
207             break;
208         }
209     }
210     
211     // this method should be removed in the future because
212     // newer versions of Saxon have a children() method in class XdmNode
213     private List<XdmNode> getChildren(XdmNode parent, String namespace, String child) {
214         List<XdmNode> children = new ArrayList<>();
215         XdmSequenceIterator iterator = parent.axisIterator(CHILD, new QName(namespace, child));
216         while (iterator.hasNext()) {
217             XdmItem item = iterator.next();
218             if (item instanceof XdmNode) {
219                 children.add((XdmNode)item);
220             }
221         }
222         return children;
223     }
224     
225     private void setPageNumbers(Page page, String header) {
226         Matcher pageNumbersMatcher = HEADER.matcher(header);
227         if (pageNumbersMatcher.find()) {
228             String fromPrintPageNumber = afterNumberSign(pageNumbersMatcher.group(1));
229             String untilPrintPageNumber = afterNumberSign(pageNumbersMatcher.group(2));
230             String pageNumber = afterNumberSign(pageNumbersMatcher.group(3));
231             
232             // if there is only one print page number, set the other one
233             if (fromPrintPageNumber.length() > 0 && untilPrintPageNumber.length() == 0) {
234                 untilPrintPageNumber = fromPrintPageNumber;
235             }
236             if (fromPrintPageNumber.length() == 0 && untilPrintPageNumber.length() > 0) {
237                 fromPrintPageNumber = untilPrintPageNumber;
238             }
239             
240             page.setFromPrintPageNumber(getNumber(fromPrintPageNumber));
241             page.setUntilPrintPageNumber(getNumber(untilPrintPageNumber));
242             page.setPageNumber(getNumber(pageNumber));
243         }
244         
245         // no page number found - do nothing
246     }
247     
248     private String afterNumberSign(String brailleNumber) {
249         int index = brailleNumber.indexOf('⠼');
250         if (index >= 0) {
251             return brailleNumber.substring(index + 1);
252         }
253         // no number sign found
254         return "";
255     }
256     
257     private Integer getNumber(String brailleNumber) {
258         if (brailleNumber.length() == 0) {
259             return null;
260         }
261         
262         int number = 0;
263         for (int i = 0; i < brailleNumber.length(); i++) {
264             char brailleDigit = brailleNumber.charAt(i);
265             int digit = BRAILLE_DIGITS.indexOf(brailleDigit);   // digit >= 0
266             number = number*10 + digit;
267         }
268         return number;
269     }
270     
271     private void determineVolumeMetadata(Book book) {
272         Volume previousVolume = null;
273         int expectedFirstPageNumber = 1;
274         for (Volume volume : book.getVolumes()) {
275             volume.setFirstPrintPageNumber(getFirstPrintPageNumber(volume));
276             volume.setLastPrintPageNumber(getLastPrintPageNumber(volume));
277             expectedFirstPageNumber = setPageNumbers(volume, expectedFirstPageNumber);
278             // adjust the last page number of the previous volume
279             if (previousVolume != null) {
280                 previousVolume.setLastPageNumber(volume.getFirstPageNumber() - 1);
281             }
282             previousVolume = volume;
283         }
284     }
285         
286     private Integer getFirstPrintPageNumber(Volume volume) {
287         return volume.getSections().stream()
288                 .map(Section::getPages)
289                 .flatMap(Collection::stream)
290                 .map(Page::getFromPrintPageNumber)
291                 .filter(Objects::nonNull)
292                 .findFirst()
293                 .orElse(null);
294     }
295         
296     // this will actually return the highest print page number, not the last one
297     // which is good, esp. in the case that the last volume ends with a TOC
298     // with out-of-range print page numbers
299     //
300     // another stategy would be to discard print page numbers lower than
301     // the highest one of the previous volume
302     private Integer getLastPrintPageNumber(Volume volume) {
303         return volume.getSections().stream()
304                 .map(Section::getPages)
305                 .flatMap(Collection::stream)
306                 .map(Page::getUntilPrintPageNumber)
307                 .filter(Objects::nonNull)
308                 .sorted(Collections.reverseOrder())
309                 .findFirst()
310                 .orElse(null);
311     }
312 
313     /**
314      * Sets the page numbers and returns the expected first page number of the next volume
315      * 
316      * @param volume
317      * @param expectedFirstPageNumber
318      * @return expected first page number of next volume
319      */
320     private int setPageNumbers(Volume volume, int expectedFirstPageNumber) {
321         volume.setFirstPageNumber(expectedFirstPageNumber);
322         volume.setLastPageNumber(expectedFirstPageNumber);
323         int lastPageNumber = expectedFirstPageNumber;
324         boolean pageNumberFound = false;
325         for (Section section : volume.getSections()) {
326             int pagesInThisSection = 0;
327             for (Page page : section.getPages()) {
328                 if (page.getPageNumber() != null) {
329                     if (!pageNumberFound) {
330                         // the first page number is adjusted
331                         volume.setFirstPageNumber(volume.getFirstPageNumber() + page.getPageNumber() - lastPageNumber);
332                         pageNumberFound = true;
333                     }
334                     lastPageNumber = page.getPageNumber();
335                 }
336                 volume.setLastPageNumber(lastPageNumber);
337                 lastPageNumber ++;
338                 pagesInThisSection ++;
339             }
340             
341             // add one in case of duplex mode and the section has an odd number of pages
342             if (volume.getDuplex()) {
343                 lastPageNumber += pagesInThisSection % 2;
344             }
345         }
346         
347         return lastPageNumber;
348     }
349     
350     private String createMetadataXml(Book book, String identifier, String brfNamePattern, int brfNumberWidth, String brfFileExtension, String optionalDate) throws SaxonApiException {
351         String date = optionalDate;
352         if (date == null || date.length() == 0) {
353             date = LocalDate.now().format(DAY_MONTH_YEAR);
354         }
355         StringBuilder xml = new StringBuilder();
356         xml.append("<lois_id>").append(identifier).append("</lois_id>");
357         int volumeIndex = 0;
358         for (Volume volume : book.getVolumes()) {
359             volumeIndex ++;
360             xml.append("<volume>");
361             xml.append("<filename>").append(getFilename(volumeIndex, brfNamePattern, brfNumberWidth, brfFileExtension)).append("</filename>");
362             xml.append("<vtype>br</vtype>");
363             xml.append("<volumenumber>").append(volumeIndex).append("</volumenumber>");
364             xml.append("<fromip>");
365             if (volume.getFirstPrintPageNumber() != null) {
366                 xml.append(volume.getFirstPrintPageNumber());
367             }
368             xml.append("</fromip>");
369             xml.append("<tillip>");
370             if (volume.getLastPrintPageNumber() != null) {
371                 xml.append(volume.getLastPrintPageNumber());
372             }
373             xml.append("</tillip>");
374             xml.append("<ippages>");
375             if (volume.getFirstPrintPageNumber() != null && volume.getLastPrintPageNumber() != null) {
376                 xml.append(volume.getLastPrintPageNumber() - volume.getFirstPrintPageNumber() + 1);
377             }
378             xml.append("</ippages>");
379             xml.append("<fromcp>");
380             if (volume.getFirstPageNumber() != null) {
381                 xml.append(volume.getFirstPageNumber());
382             }
383             xml.append("</fromcp>");
384             xml.append("<tillcp>");
385             if (volume.getLastPageNumber() != null) {
386                 xml.append(volume.getLastPageNumber());
387             }
388             xml.append("</tillcp>");
389             xml.append("<amount>");
390             if (volume.getFirstPageNumber() != null && volume.getLastPageNumber() != null) {
391                 xml.append(volume.getLastPageNumber() - volume.getFirstPageNumber() + 1);
392             }
393             xml.append("</amount>");
394             xml.append("<last>").append(volumeIndex == book.getVolumes().size() ? "Y" : "N").append("</last>");
395             xml.append("<vreadydate>").append(date).append("</vreadydate>");
396             xml.append("</volume>");          
397         }
398         String document = "<document>".concat(xml.toString()).concat("</document>");
399         return document;
400     }
401 
402     private String getFilename(int volumeIndex, String brfNamePattern, int brfNumberWidth, String brfFileExtension) {
403         String brfNumber = String.valueOf(volumeIndex);
404         while (brfNumber.length() < brfNumberWidth) {
405             brfNumber = "0" + brfNumber;
406         }
407         return brfNamePattern.replace("{}", brfNumber) + brfFileExtension;
408     }
409     
410     @Component(
411             name = "dedicon:metadata",
412             service = {XProcStepProvider.class},
413             property = {"type:String={http://www.dedicon.nl}metadata"}
414     )
415     public static class Provider implements XProcStepProvider {
416 
417         private List<FileFormatProvider> fileFormatProviders = new ArrayList<>();
418         private MemoizingProvider<Query,FileFormat> fileFormatProvider = memoize(dispatch(fileFormatProviders));
419 
420         @Override
421         public XProcStep newStep(XProcRuntime runtime, XAtomicStep step) {
422             return new MetadataStep(runtime, step, fileFormatProvider);
423         }
424 
425         @Reference(
426                 name = "FileFormatProvider",
427                 unbind = "unbindFileFormatProvider",
428                 service = FileFormatProvider.class,
429                 cardinality = ReferenceCardinality.MULTIPLE,
430                 policy = ReferencePolicy.DYNAMIC
431         )
432         protected void bindFileFormatProvider(FileFormatProvider provider) {
433                 fileFormatProviders.add(provider);
434         }
435 
436         protected void unbindFileFormatProvider(FileFormatProvider provider) {
437                 fileFormatProviders.remove(provider);
438                 this.fileFormatProvider.invalidateCache();
439         }
440     }
441 }