/*
 * Decompiled with CFR 0.152.
 */
package org.ow2.weblab.service.normaliser.tika;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import javax.jws.WebService;
import javax.xml.bind.DatatypeConverter;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.language.ProfilingHandler;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.TeeContentHandler;
import org.ow2.weblab.content.api.ContentManager;
import org.ow2.weblab.content.impl.FileContentManager;
import org.ow2.weblab.core.extended.exception.WebLabCheckedException;
import org.ow2.weblab.core.extended.factory.AnnotationFactory;
import org.ow2.weblab.core.extended.util.ResourceUtil;
import org.ow2.weblab.core.helper.PoKHelper;
import org.ow2.weblab.core.helper.impl.JenaPoKHelper;
import org.ow2.weblab.core.model.Annotation;
import org.ow2.weblab.core.model.Document;
import org.ow2.weblab.core.model.PieceOfKnowledge;
import org.ow2.weblab.core.model.Resource;
import org.ow2.weblab.core.model.Text;
import org.ow2.weblab.core.services.Analyser;
import org.ow2.weblab.core.services.ContentNotAvailableException;
import org.ow2.weblab.core.services.InvalidParameterException;
import org.ow2.weblab.core.services.UnexpectedException;
import org.ow2.weblab.core.services.analyser.ProcessArgs;
import org.ow2.weblab.core.services.analyser.ProcessReturn;
import org.ow2.weblab.rdf.Value;
import org.ow2.weblab.service.normaliser.tika.Messages;
import org.ow2.weblab.service.normaliser.tika.TikaConfiguration;
import org.ow2.weblab.service.normaliser.tika.handlers.WebLabHandlerDecorator;
import org.ow2.weblab.service.normaliser.tika.metadatawriter.MetadataWriter;
import org.purl.dc.elements.DublinCoreAnnotator;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

/**
 * WebLab {@link Analyser} that parses the native content of a {@link Document} with Apache Tika.
 *
 * <p>For each processed document the service reads the native content file through the
 * {@link ContentManager}, parses it with a Tika {@link Parser} feeding a configurable
 * {@link WebLabHandlerDecorator}, optionally writes an XHTML rendition back as normalised
 * content, and optionally annotates the document with the extracted {@link Metadata}
 * through the configured {@link MetadataWriter}.
 */
@WebService(endpointInterface="org.ow2.weblab.core.services.Analyser")
public class TikaExtractorService implements Analyser {
    /** Logger bound to the runtime class (subclasses log under their own name). */
    protected final Log logger = LogFactory.getLog(this.getClass());
    /** Gives access to the native content and stores the normalised content. */
    protected final ContentManager contentManager;
    /** Service level options (HTML generation, metadata annotation, language, ...). */
    protected final TikaConfiguration serviceConfig;
    /** Tika configuration, either the default one or loaded from a classpath resource. */
    protected final TikaConfig tikaConfig;
    /** Whether the native content file has to be deleted once a document has been processed. */
    protected final boolean removeContent;
    /** Date format using the {@code yyyy-MM-dd} pattern; not used by this class itself. */
    protected final DateFormat simpleDateFormat;
    /** Writes the extracted Tika metadata into the annotation of the document. */
    protected MetadataWriter metadataWriter;

    /**
     * Builds the service from its configuration.
     *
     * @param conf the service configuration
     * @throws TikaException propagated from the construction of the {@link TikaConfig}
     * @throws IOException if the content manager is unavailable, if the Tika XML configuration
     *         cannot be found or parsed, or if the metadata writer cannot be instantiated
     */
    public TikaExtractorService(TikaConfiguration conf) throws TikaException, IOException {
        this.serviceConfig = conf;
        this.contentManager = ContentManager.getInstance();
        // Must be tested before the first dereference below, otherwise a missing content
        // manager surfaces as a bare NullPointerException instead of this explicit error.
        if (this.contentManager == null) {
            String err = Messages.getString("tika.message.error.unableToLoadContentManager");
            this.logger.fatal(err);
            throw new IOException(err);
        }
        this.removeContent = !(this.contentManager.getReader() instanceof FileContentManager) && this.serviceConfig.isRemoveTempContent();
        this.simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd");

        String confPath = this.serviceConfig.getPathToXmlConfigurationFile();
        if (confPath == null) {
            this.logger.debug(Messages.getString("tika.message.debug.defaultTikaConf"));
            this.tikaConfig = new TikaConfig();
        } else {
            // getResource returns null when the resource is not on the classpath.
            java.net.URL confUrl = this.getClass().getClassLoader().getResource(confPath);
            if (confUrl == null) {
                String err = "Tika configuration file not found in classpath: " + confPath;
                this.logger.fatal(err);
                throw new IOException(err);
            }
            try {
                this.tikaConfig = new TikaConfig(confUrl);
            }
            catch (SAXException e) {
                throw new IOException(e);
            }
        }
        if (!(this.tikaConfig.getParser() instanceof CompositeParser)) {
            this.logger.warn(Messages.getString("tika.message.warn.notACompositeParser.1", new Object[]{this.tikaConfig.getParser().getClass().getCanonicalName()}));
        }

        final String writerErr = "Failed to initialize the metadataWriter field";
        try {
            this.metadataWriter = (MetadataWriter)conf.getMetadataWriterClass().newInstance();
        }
        catch (InstantiationException e) {
            this.logger.fatal(writerErr, e);
            throw new IOException(writerErr, e);
        }
        catch (IllegalAccessException e) {
            this.logger.fatal(writerErr, e);
            throw new IOException(writerErr, e);
        }
        this.logger.info(Messages.getString("tika.message.info.started"));
    }

    /**
     * Extracts text (and optionally metadata) from the document held by {@code args}.
     *
     * <p>A first extraction relies on the format annotated on the document, if any. When it
     * leaves the document without any {@link Text} sub-resource, a second extraction is run
     * with Tika's automatic type detection forced.
     *
     * @param args the process arguments; its resource must be a {@link Document}
     * @return a {@link ProcessReturn} holding the same document instance
     * @throws InvalidParameterException if {@code args} or its resource is {@code null}, or if
     *         the resource is not a {@link Document}
     * @throws ContentNotAvailableException if the native content cannot be read
     * @throws UnexpectedException if parsing fails or the handler cannot be instantiated
     */
    @Override
    public ProcessReturn process(ProcessArgs args) throws InvalidParameterException, ContentNotAvailableException, UnexpectedException {
        this.logger.trace("Process method called.");
        Document document = this.checkArgs(args);
        this.logger.info(Messages.getString("tika.message.info.processCalled.1", new Object[]{document.getUri()}));
        File file = this.getContent(document);
        try {
            Metadata extractedMeta = this.extractTextAndMetadata(document, file, false);
            if (ResourceUtil.getSelectedSubResources(document, Text.class).isEmpty()) {
                this.logger.warn(Messages.getString("tika.message.warn.noTextFound.2", new Object[]{file.getAbsolutePath(), document.getUri()}));
                extractedMeta = this.extractTextAndMetadata(document, file, true);
            }
            if (this.serviceConfig.isAddMetadata()) {
                this.annotateWithMetadata(document, extractedMeta);
            }
        }
        finally {
            // In a finally block so the content file is also removed when extraction fails.
            if (this.removeContent && !file.delete()) {
                this.logger.warn(Messages.getString("tika.message.warn.unableToDeleteTemp.2", new Object[]{file.getAbsolutePath(), document.getUri()}));
            }
        }
        ProcessReturn pr = new ProcessReturn();
        pr.setResource(document);
        this.logger.info(Messages.getString("tika.message.info.endOfprocess.1", new Object[]{document.getUri()}));
        return pr;
    }

    /**
     * Creates an annotation on the document and fills it with the extracted metadata.
     *
     * <p>When a service URI is configured, provenance statements (producer and creation date)
     * are added as well. An invalid document URI is logged and leaves the annotation uncommitted.
     *
     * @param document the document to annotate
     * @param extractedMeta the metadata handed to the {@link MetadataWriter}
     */
    private void annotateWithMetadata(Document document, Metadata extractedMeta) {
        Annotation annot = AnnotationFactory.createAndLinkAnnotation(document);
        JenaPoKHelper ahe = new JenaPoKHelper(annot);
        // Statements are buffered and only pushed by the explicit commit() below.
        ahe.setAutoCommitMode(false);
        try {
            this.metadataWriter.write(extractedMeta, ahe, new URI(document.getUri()));
            if (this.serviceConfig.getServiceUri() != null) {
                ahe.createResStat(annot.getUri(), "http://weblab.ow2.org/core/1.2/ontology/processing#isProducedBy", this.serviceConfig.getServiceUri());
                ahe.createLitStat(annot.getUri(), "http://purl.org/dc/terms/created", DatatypeConverter.printDateTime(Calendar.getInstance()));
                ahe.setNSPrefix("dct", "http://purl.org/dc/terms/");
                ahe.setNSPrefix("wlp", "http://weblab.ow2.org/core/1.2/ontology/processing#");
            }
            ahe.commit();
        }
        catch (URISyntaxException e) {
            this.logger.error("Document URI is not a valid : " + e.getLocalizedMessage(), e);
        }
    }

    /**
     * Validates the process arguments and returns the document to process.
     *
     * @param args the arguments received by {@link #process(ProcessArgs)}
     * @return the resource of {@code args}, cast to {@link Document}
     * @throws InvalidParameterException if {@code args} or its resource is {@code null}, or if
     *         the resource is not a {@link Document}
     */
    protected Document checkArgs(ProcessArgs args) throws InvalidParameterException {
        if (args == null) {
            String err = Messages.getString("tika.message.error.processArgsNull");
            this.logger.error(err);
            throw new InvalidParameterException(err, Messages.getString("tika.message.error.invalidParameter"));
        }
        Resource res = args.getResource();
        if (res == null) {
            String err = Messages.getString("tika.message.error.resourceNull");
            this.logger.error(err);
            throw new InvalidParameterException(err, Messages.getString("tika.message.error.invalidParameter"));
        }
        if (!(res instanceof Document)) {
            String err = Messages.getString("tika.message.error.notADocument.2", new Object[]{res.getUri(), res.getClass().getCanonicalName()});
            this.logger.error(err);
            throw new InvalidParameterException(err, Messages.getString("tika.message.error.invalidParameter"));
        }
        return (Document)res;
    }

    /**
     * Retrieves the native content file of the document and checks that it can be read.
     *
     * @param document the document whose native content is requested
     * @return an existing, readable file
     * @throws ContentNotAvailableException if the content manager fails, or if the file does
     *         not exist or is not readable
     */
    private File getContent(Document document) throws ContentNotAvailableException {
        File file;
        try {
            file = this.contentManager.readNativeContent(document);
        }
        catch (WebLabCheckedException wlce) {
            String err = Messages.getString("tika.message.error.contentNotAvailable.1", new Object[]{document.getUri()});
            this.logger.error(err, wlce);
            throw new ContentNotAvailableException(err, Messages.getString("tika.message.error.contentNotAvailableSimple"), wlce);
        }
        if (!file.exists()) {
            String err = Messages.getString("tika.message.error.contentFileNotFound.2", new Object[]{file.getAbsolutePath(), document.getUri()});
            this.logger.error(err);
            throw new ContentNotAvailableException(err, Messages.getString("tika.message.error.contentNotAvailableSimple"));
        }
        if (!file.canRead()) {
            String err = Messages.getString("tika.message.error.contentFileNotReadable.2", new Object[]{file.getAbsolutePath(), document.getUri()});
            this.logger.error(err);
            throw new ContentNotAvailableException(err, Messages.getString("tika.message.error.contentNotAvailableSimple"));
        }
        return file;
    }

    /**
     * Parses the content file with Tika, feeding the document through the configured handler.
     *
     * <p>Depending on the service configuration, the SAX events are also sent to a language
     * profiler and to an XHTML serialiser whose output is stored as normalised content. The
     * temporary XHTML file is always deleted before returning, including on failure.
     *
     * @param document the document being processed
     * @param contentFile the native content to parse
     * @param forceAutoDetectParser {@code true} to ignore the format annotated on the document
     *        and let Tika detect the type
     * @return the extracted metadata, or an empty {@link Metadata} when metadata annotation is
     *         disabled in the configuration
     * @throws UnexpectedException if the handler cannot be instantiated or parsing fails
     * @throws ContentNotAvailableException if the content file cannot be opened
     */
    protected Metadata extractTextAndMetadata(Document document, File contentFile, boolean forceAutoDetectParser) throws UnexpectedException, ContentNotAvailableException {
        Parser parser = this.selectParser(document, forceAutoDetectParser);
        ProfilingHandler langGuesser = new ProfilingHandler();
        boolean guessLang = this.serviceConfig.isAddMetadata() && this.serviceConfig.isAnnotateDocumentWithLang();
        boolean generateHtml = this.serviceConfig.isGenerateHtml();

        // The temporary file is only needed when an XHTML rendition is requested.
        File xhtmlOutputFile = null;
        if (generateHtml) {
            try {
                xhtmlOutputFile = File.createTempFile("tika", ".xhtml");
            }
            catch (IOException ioe) {
                this.logger.warn(Messages.getString("tika.message.warn.unableToCreateTempFile.1", new Object[]{document.getUri()}), ioe);
                generateHtml = false;
            }
        }

        Metadata metadata = new Metadata();
        try {
            ContentHandler muHandler = this.getMUCreatorCHandler(document);
            ContentHandler htmlHandler = null;
            if (generateHtml) {
                try {
                    htmlHandler = this.getHtmlCreatorCHandler(xhtmlOutputFile);
                }
                catch (TransformerConfigurationException tce) {
                    // Degrade gracefully: keep extracting text, just without XHTML output.
                    this.logger.warn(Messages.getString("tika.message.warn.unableToCreateTransformer.1", new Object[]{document.getUri()}), tce);
                    generateHtml = false;
                }
            }

            ContentHandler handler;
            if (guessLang && htmlHandler != null) {
                this.logger.trace("Create a TeeContentHandler for language guesser, MediaUnit creation and XHTML output creation.");
                handler = new TeeContentHandler(new ContentHandler[]{muHandler, langGuesser, htmlHandler});
            } else if (htmlHandler != null) {
                this.logger.trace("Create a TeeContentHandler for MediaUnit creation and XHTML output creation.");
                handler = new TeeContentHandler(new ContentHandler[]{muHandler, htmlHandler});
            } else if (guessLang) {
                this.logger.trace("Create a TeeContentHandler for language guesser and MediaUnit creation.");
                handler = new TeeContentHandler(new ContentHandler[]{muHandler, langGuesser});
            } else {
                this.logger.trace("Create a ContentHandler for MediaUnit creation.");
                handler = muHandler;
            }

            this.parseContent(parser, handler, metadata, contentFile, document);

            if (guessLang && langGuesser.getLanguage().isReasonablyCertain()) {
                metadata.set("language", langGuesser.getLanguage().getLanguage());
            } else if (this.serviceConfig.isAnnotateDocumentWithLang() && this.serviceConfig.getDefaultLang() != null) {
                metadata.set("language", this.serviceConfig.getDefaultLang());
            }
            if (generateHtml) {
                this.saveNormalisedContent(xhtmlOutputFile, document);
            }
        }
        finally {
            // deleteQuietly accepts null and never throws; runs on failures too.
            FileUtils.deleteQuietly(xhtmlOutputFile);
        }
        if (this.serviceConfig.isAddMetadata()) {
            return metadata;
        }
        return new Metadata();
    }

    /**
     * Chooses the Tika parser for the document.
     *
     * <p>Unless auto-detection is forced, the format annotated on the document is read; when
     * the configured parser is a {@link CompositeParser} holding a parser for that type, this
     * dedicated parser is returned. Without a usable type an {@link AutoDetectParser} is used.
     *
     * @param document the document whose annotated format is read
     * @param forceAutoDetectParser {@code true} to skip the annotated format
     * @return the parser to use, never {@code null} as long as the Tika configuration has one
     */
    private Parser selectParser(Document document, boolean forceAutoDetectParser) {
        String mimeType = null;
        if (!forceAutoDetectParser) {
            Value format = new DublinCoreAnnotator(document).readFormat();
            if (format != null && format.hasValue()) {
                mimeType = (String)format.firstTypedValue();
                if (format.getValues().size() > 1) {
                    this.logger.warn(Messages.getString("tika.message.warn.moreThanOneType.2", new Object[]{document.getUri(), mimeType}));
                }
            }
            this.logger.debug("Mime type detected in Resource: " + mimeType);
        }
        if (mimeType == null) {
            return new AutoDetectParser(this.tikaConfig);
        }
        Parser configured = this.tikaConfig.getParser();
        if (!(configured instanceof CompositeParser)) {
            this.logger.debug("Tika Config does not use an AutodetectParser but a " + configured.getClass().getCanonicalName() + ".");
            return configured;
        }
        MediaType mediaType = MediaType.parse(mimeType);
        Parser dedicated = (Parser)((CompositeParser)configured).getParsers().get(mediaType);
        if (dedicated != null) {
            return dedicated;
        }
        this.logger.debug("No parser for type " + mediaType + " let Tika guess type.");
        return new AutoDetectParser(this.tikaConfig);
    }

    /**
     * Opens the content file and runs the parser on it, always closing the stream.
     *
     * @param parser the parser to run
     * @param handler the receiver of the SAX events
     * @param metadata filled by the parser
     * @param contentFile the file to parse
     * @param document the processed document, used in log and error messages
     * @throws ContentNotAvailableException if the content file cannot be opened
     * @throws UnexpectedException if the parser raises an I/O, SAX or Tika error
     */
    private void parseContent(Parser parser, ContentHandler handler, Metadata metadata, File contentFile, Document document) throws ContentNotAvailableException, UnexpectedException {
        ParseContext context = new ParseContext();
        FileInputStream stream;
        try {
            stream = new FileInputStream(contentFile);
        }
        catch (FileNotFoundException fnfe) {
            String err = Messages.getString("tika.message.error.contentFileNotFound.2", new Object[]{contentFile.getAbsolutePath(), document.getUri()});
            this.logger.error(err, fnfe);
            throw new ContentNotAvailableException(err, Messages.getString("tika.message.error.contentNotAvailableSimple"), fnfe);
        }
        this.logger.debug("Start parsing " + contentFile.getPath() + " for document " + document.getUri() + ".");
        try {
            parser.parse(stream, handler, metadata, context);
        }
        catch (IOException ioe) {
            String err = Messages.getString("tika.message.error.ioeOnContent.2", new Object[]{contentFile.getPath(), document.getUri()});
            this.logger.error(err, ioe);
            throw new UnexpectedException(err, Messages.getString("tika.message.error.ioeOnContentSimple"), ioe);
        }
        catch (SAXException saxe) {
            String err = Messages.getString("tika.message.error.saxeOnContent.2", new Object[]{contentFile.getPath(), document.getUri()});
            this.logger.error(err, saxe);
            throw new UnexpectedException(err, Messages.getString("tika.message.error.errorOnContentSimple"), saxe);
        }
        catch (TikaException te) {
            String err = Messages.getString("tika.message.error.tikaExOnContent.2", new Object[]{contentFile.getPath(), document.getUri()});
            this.logger.error(err, te);
            throw new UnexpectedException(err, Messages.getString("tika.message.error.errorOnContentSimple"), te);
        }
        finally {
            IOUtils.closeQuietly(stream);
        }
        this.logger.debug("Finished parsing " + contentFile.getPath() + " for document " + document.getUri() + ".");
    }

    /**
     * Stores the generated XHTML file as normalised content of the document.
     *
     * <p>This step is best effort: a missing or empty file, or a failure of the content
     * manager, is only logged as a warning.
     *
     * @param xhtmlOutputFile the file produced by the XHTML serialiser
     * @param document the document receiving the normalised content
     */
    private void saveNormalisedContent(File xhtmlOutputFile, Document document) {
        if (!xhtmlOutputFile.exists()) {
            this.logger.warn(Messages.getString("tika.message.warn.noOutputFile.2", new Object[]{xhtmlOutputFile.getPath(), document.getUri()}));
            return;
        }
        if (FileUtils.sizeOf(xhtmlOutputFile) <= 0L) {
            this.logger.warn(Messages.getString("tika.message.warn.emptyOutputFile.2", new Object[]{xhtmlOutputFile.getPath(), document.getUri()}));
            return;
        }
        try {
            FileInputStream fis = new FileInputStream(xhtmlOutputFile);
            this.logger.debug("Save normalised content file: " + xhtmlOutputFile);
            try {
                this.contentManager.writeNormalisedContent(fis, document);
            }
            catch (WebLabCheckedException wlce) {
                this.logger.warn(Messages.getString("tika.message.warn.errorSavingNormalised.2", new Object[]{xhtmlOutputFile.getPath(), document.getUri()}), wlce);
            }
            finally {
                IOUtils.closeQuietly(fis);
            }
        }
        catch (FileNotFoundException fnfe) {
            this.logger.warn(Messages.getString("tika.message.warn.noOutputFile.2", new Object[]{xhtmlOutputFile.getPath(), document.getUri()}), fnfe);
        }
    }

    /**
     * Instantiates the configured {@link WebLabHandlerDecorator} and binds it to the document.
     *
     * <p>The decorator wraps a {@link BodyContentHandler} created with a write limit of
     * {@code -1}.
     *
     * @param document the document handed to the decorator
     * @return the initialised decorator
     * @throws UnexpectedException if the configured class cannot be instantiated
     */
    private WebLabHandlerDecorator getMUCreatorCHandler(Document document) throws UnexpectedException {
        WebLabHandlerDecorator wlhd;
        try {
            wlhd = (WebLabHandlerDecorator)this.serviceConfig.getWebLabHandlerDecoratorClass().newInstance();
        }
        catch (InstantiationException ie) {
            String err = Messages.getString("tika.message.error.badHandler.1", new Object[]{this.serviceConfig.getWebLabHandlerDecoratorClass().getCanonicalName()});
            this.logger.error(err, ie);
            throw new UnexpectedException(err, err, ie);
        }
        catch (IllegalAccessException iae) {
            String err = Messages.getString("tika.message.error.badHandler.1", new Object[]{this.serviceConfig.getWebLabHandlerDecoratorClass().getCanonicalName()});
            this.logger.error(err, iae);
            throw new UnexpectedException(err, err, iae);
        }
        wlhd.setDocument(document);
        wlhd.setTikaConfiguration(this.serviceConfig);
        wlhd.setContentHandler(new BodyContentHandler(-1));
        return wlhd;
    }

    /**
     * Creates a SAX handler that serialises the received events as indented XML into a file.
     *
     * @param xhtmlFile the destination file
     * @return the serialising handler
     * @throws TransformerConfigurationException if the platform transformer factory does not
     *         support SAX, or if the handler cannot be created
     */
    private ContentHandler getHtmlCreatorCHandler(File xhtmlFile) throws TransformerConfigurationException {
        TransformerFactory factory = TransformerFactory.newInstance();
        // Reported as a configuration error so the caller can fall back to no XHTML output
        // instead of failing with a ClassCastException.
        if (!(factory instanceof SAXTransformerFactory)) {
            throw new TransformerConfigurationException("TransformerFactory " + factory.getClass().getName() + " does not support SAX.");
        }
        TransformerHandler handler = ((SAXTransformerFactory)factory).newTransformerHandler();
        handler.getTransformer().setOutputProperty("method", "xml");
        handler.getTransformer().setOutputProperty("indent", "yes");
        handler.setResult(new StreamResult(xhtmlFile));
        return handler;
    }
}

