/*
 * Decompiled with CFR 0.152.
 */
package org.ow2.weblab.services.normaliser.tika;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.ListIterator;
import java.util.Locale;
import java.util.Map;
import javax.jws.WebService;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerFactoryConfigurationError;
import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.DublinCore;
import org.apache.tika.metadata.HttpHeaders;
import org.apache.tika.metadata.MSOffice;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;
import org.ow2.weblab.content.ContentManager;
import org.ow2.weblab.core.extended.exception.WebLabCheckedException;
import org.ow2.weblab.core.extended.exception.WebLabUncheckedException;
import org.ow2.weblab.core.extended.factory.AnnotationFactory;
import org.ow2.weblab.core.extended.jaxb.XMLStringCleaner;
import org.ow2.weblab.core.extended.properties.PropertiesLoader;
import org.ow2.weblab.core.extended.util.ResourceUtil;
import org.ow2.weblab.core.helper.PoKHelperExtended;
import org.ow2.weblab.core.helper.RDFHelperFactory;
import org.ow2.weblab.core.helper.ResourceHelper;
import org.ow2.weblab.core.model.Annotation;
import org.ow2.weblab.core.model.Document;
import org.ow2.weblab.core.model.PieceOfKnowledge;
import org.ow2.weblab.core.model.Resource;
import org.ow2.weblab.core.model.Text;
import org.ow2.weblab.core.services.AccessDeniedException;
import org.ow2.weblab.core.services.Analyser;
import org.ow2.weblab.core.services.ContentNotAvailableException;
import org.ow2.weblab.core.services.InsufficientResourcesException;
import org.ow2.weblab.core.services.InvalidParameterException;
import org.ow2.weblab.core.services.ServiceNotConfiguredException;
import org.ow2.weblab.core.services.UnexpectedException;
import org.ow2.weblab.core.services.UnsupportedRequestException;
import org.ow2.weblab.core.services.analyser.ProcessArgs;
import org.ow2.weblab.core.services.analyser.ProcessReturn;
import org.ow2.weblab.services.normaliser.tika.MediaUnitContentHandler;
import org.springframework.core.io.ClassPathResource;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

/*
 * Exception performing whole class analysis ignored.
 */
@WebService(endpointInterface="org.ow2.weblab.core.services.Analyser")
public class TikaExtractorService
implements Analyser {
    /** Class-wide commons-logging logger. */
    private static final Log logger = LogFactory.getLog(TikaExtractorService.class);
    /** Name of the service properties file. */
    public static final String CONFIG_FILE = "tika-service.properties";
    /** Property key holding the base URI used for Tika-specific predicates. */
    public static final String BASE_URI_PROPERTY_NAME = "baseUri";
    /**
     * Property key controlling deletion of the native content file.
     * NOTE(review): the identifier is misspelt ("COTNENT") but it is public, so it is kept as-is.
     */
    public static final String REMOVE_COTNENT_PROPERTY_NAME = "removeContent";
    /** Property key controlling whether already-present metadata may be annotated again. */
    public static final String OVERRIDE_METADATA_PROPERTY_NAME = "overrideMetadata";
    /** Property key "xhtmlFolder"; not referenced elsewhere in this class. */
    public static final String XHTML_FOLDER_PROPERTY_NAME = "xhtmlFolder";
    /** Property key "saveXhtml"; not referenced elsewhere in this class. */
    public static final String XHTML_SAVE = "saveXhtml";
    /** Content manager used by {@code process} to read the native content of a document. */
    protected ContentManager contentManager = ContentManager.getInstance();
    /** Base URI prepended to cleaned metadata names that have no explicit mapping; also bound to the "tika" prefix in {@code annotate}. */
    private static String baseUri = "http://weblab.eads.com/service/format/tika/";
    /** Namespace prefix value "tika". */
    private static final String BASE_PREFIX = "tika";
    /** When true, {@code process} deletes the native content file once extraction is done. */
    private static boolean removeContent = true;
    /** When false, {@code annotate} skips predicates for which the document already has a statement. */
    private static boolean overrideMetadata = false;
    /** Not referenced elsewhere in this class. */
    private static final boolean GENERATE_HTML = false;
    /** Sentinel value "NO_MIME_TYPE", meaning no usable mime type is available for the document. */
    private static final String NO_MIME_TYPE_DETECTED_PROPERTY = "NO_MIME_TYPE";
    /** Predicates whose values {@code cleanMap} converts with {@code convertToISO8601Date}; assigned in the static initializer. */
    private static List<String> DATE_PREDS;

    /**
     * Creates the service and verifies that the content manager field was initialised.
     *
     * @throws WebLabUncheckedException if {@code contentManager} is {@code null}
     */
    public TikaExtractorService() {
        if (null != this.contentManager) {
            return;
        }
        throw new WebLabUncheckedException("Unable to load required properties file for content management.");
    }

    /**
     * Extracts text and metadata from the native content of the document carried by {@code args}.
     * <p>
     * The native content file is read through the content manager, its size is recorded as a
     * {@code dcterms:extent} value, extraction is run once with the annotated mime type and, when
     * that produces no {@link Text} unit, a second time with the auto-detect parser forced.
     * The collected metadata is finally written on the document as an annotation.
     *
     * @param args the process arguments; must carry a {@link Document}
     * @return a {@link ProcessReturn} holding the enriched document
     * @throws InvalidParameterException if {@code args} or its resource is invalid
     * @throws ContentNotAvailableException if the native content cannot be retrieved or opened
     * @throws UnexpectedException if the content cannot be parsed
     */
    public ProcessReturn process(ProcessArgs args) throws AccessDeniedException, ContentNotAvailableException, InsufficientResourcesException, InvalidParameterException, ServiceNotConfiguredException, UnexpectedException, UnsupportedRequestException {
        Document document = TikaExtractorService.checkArgs(args);
        logger.info("Process the document " + document.getUri() + ".");

        File file;
        try {
            file = this.contentManager.readNativeContent(document);
        } catch (WebLabCheckedException wlce) {
            // Keep the root cause: it is the only hint on why the content is unavailable.
            logger.error("Unable to retrieve native content of " + document.getUri() + ".", wlce);
            throw new ContentNotAvailableException("Tika service is unable to retrieve native content.", "Unable to retrieve content.", wlce);
        }

        Map<String, List<String>> toAnnot = new HashMap<String, List<String>>();
        List<String> extent = new ArrayList<String>();
        extent.add(String.valueOf(file.length()) + " bytes");
        toAnnot.put("http://purl.org/dc/terms/extent", extent);

        try {
            TikaExtractorService.extractTextAndMetadata(document, file, toAnnot, false);
            if (ResourceUtil.getSelectedSubResources(document, Text.class).isEmpty()) {
                logger.info("No texte unit extracted from document. Try to extract once more with auto detect parser");
                TikaExtractorService.extractTextAndMetadata(document, file, toAnnot, true);
            }
        } finally {
            // Delete the temporary content even when extraction fails, so files do not pile up.
            if (removeContent && !file.delete()) {
                logger.warn("Unable to delete temp file." + file.getAbsolutePath());
            }
        }

        TikaExtractorService.annotate(document, toAnnot);
        ProcessReturn pr = new ProcessReturn();
        pr.setResource(document);
        logger.info("End of processing " + document.getUri() + " in Tika extractor");
        return pr;
    }

    /**
     * Writes the collected metadata on the document as literal statements of a new annotation.
     * <p>
     * Unless {@code overrideMetadata} is set, a predicate is skipped when the document already
     * has a literal or resource statement for it. If committing fails, the whole annotation is
     * removed from the document again.
     *
     * @param document the document to annotate
     * @param toAnnot  predicate URI to values; when empty only a warning is logged
     */
    protected static void annotate(Document document, Map<String, List<String>> toAnnot) {
        if (toAnnot.isEmpty()) {
            logger.warn("No metadata extracted for document: " + document.getUri());
            return;
        }

        Annotation annot = AnnotationFactory.createAndLinkAnnotation(document);
        PoKHelperExtended ahe = RDFHelperFactory.getPoKHelperExtended(annot);
        ResourceHelper rh = RDFHelperFactory.getResourceHelper(document);
        ahe.setAutoCommitMode(false);

        for (Map.Entry<String, List<String>> entry : toAnnot.entrySet()) {
            String predicate = entry.getKey();
            boolean alreadyDescribed = !overrideMetadata
                    && (rh.getLitsOnPredSubj(document.getUri(), predicate).size() != 0
                            || rh.getRessOnPredSubj(document.getUri(), predicate).size() != 0);
            if (alreadyDescribed) {
                continue;
            }
            for (String val : entry.getValue()) {
                ahe.createLitStat(document.getUri(), predicate, val);
            }
        }

        ahe.setNSPrefix("tika", baseUri);
        try {
            ahe.commit();
        } catch (Exception e) {
            // Best effort: a failing commit must not fail the whole request, but the cause is logged.
            logger.error("An error happened during the commiting of annotation changes. Remove the whole annotation.", e);
            logger.info("Failing metadata were: " + toAnnot);
            document.getAnnotation().remove(annot);
        }
    }

    /**
     * Parses the content file with Tika, feeding the text into the document through a
     * {@code MediaUnitContentHandler} and adding the extracted metadata to {@code toAnnot}.
     * <p>
     * The parser is the one configured for the document's first {@code dc:format} annotation
     * when such an annotation exists, the parser map contains it and
     * {@code forceAutoDetectParser} is false; otherwise an {@link AutoDetectParser} is used.
     *
     * @param document              the document receiving the extracted content
     * @param file                  the native content file to parse
     * @param toAnnot               predicate URI to values map, completed then cleaned in place
     * @param forceAutoDetectParser when true, the annotated mime type is ignored
     * @throws ContentNotAvailableException if the content file cannot be opened
     * @throws UnexpectedException          if reading, SAX processing or parsing fails
     */
    public static void extractTextAndMetadata(Document document, File file, Map<String, List<String>> toAnnot, boolean forceAutoDetectParser) throws UnexpectedException, ContentNotAvailableException {
        TikaConfig tikaConfig;
        try {
            tikaConfig = TikaExtractorService.getTikaConfig();
            logger.info("Custom tika configuration loaded successfully...");
        } catch (AccessDeniedException e) {
            logger.warn("Unable to load custom tika configuration, load default...");
            tikaConfig = TikaConfig.getDefaultConfig();
        }

        ResourceHelper h = RDFHelperFactory.getResourceHelper(document);
        List<?> formatAnnots = h.getLitsOnPredSubj(document.getUri(), "http://purl.org/dc/elements/1.1/format");
        String mimeType = formatAnnots.size() > 0 && !forceAutoDetectParser ? (String) formatAnnots.get(0) : "NO_MIME_TYPE";
        logger.info("Mime type detected in Resource: " + mimeType);

        // Declared with the parser interface: the configured parser need not be an AutoDetectParser.
        org.apache.tika.parser.Parser parser;
        Map<?, ?> parserList = tikaConfig.getParsers();
        // NOTE(review): the lookup uses the mime type string as key; confirm the key type of getParsers().
        if ("NO_MIME_TYPE".equals(mimeType) || !parserList.containsKey(mimeType)) {
            parser = new AutoDetectParser(tikaConfig);
        } else {
            parser = tikaConfig.getParser(MediaType.parse(mimeType));
        }

        // NOTE(review): the result is unused; the call is kept in case the loader fails fast on a
        // missing properties file. Confirm and remove.
        PropertiesLoader.loadProperties("tika-service.properties");

        FileInputStream stream;
        try {
            stream = new FileInputStream(file);
        } catch (FileNotFoundException fnfe) {
            throw new ContentNotAvailableException("Unable to open stream on content file.", fnfe.getMessage(), fnfe);
        }

        Metadata metadata = new Metadata();
        try {
            parser.parse(stream, TikaExtractorService.getMediaUnitContentHandler(document), metadata, new ParseContext());
            TikaExtractorService.fillMapWithMetadata(toAnnot, metadata);
        } catch (IOException e) {
            logger.error("Document stream could not be read.", e);
            throw new UnexpectedException("Document stream could not be read.", e.getMessage(), e);
        } catch (SAXException e) {
            logger.error("SAX events could not be processed.", e);
            throw new UnexpectedException("SAX events could not be processed.", e.getMessage(), e);
        } catch (TikaException e) {
            logger.error("Document could not be parsed.", e);
            throw new UnexpectedException("Document could not be parsed.", e.getMessage(), e);
        } finally {
            // Always release the file handle, including on failure; a close error on a
            // read-only stream is not worth failing the request for.
            try {
                stream.close();
            } catch (IOException closeFailure) {
                logger.warn("Unable to close stream on content file " + file.getAbsolutePath() + ".", closeFailure);
            }
        }
        TikaExtractorService.cleanMap(toAnnot);
    }

    /**
     * Detects the media type of the stream with the mime repository of the given configuration
     * and tells whether it is one of the HTML-like types handled here.
     *
     * @param tikaConfig the Tika configuration providing the mime repository
     * @param stream     the content stream; it is wrapped in a {@link BufferedInputStream}
     * @return true for text/html, application/xhtml+xml, application/vnd.wap.xhtml+xml or application/x-asp
     * @throws IOException if detection fails to read the stream
     */
    private static boolean isHtml(TikaConfig tikaConfig, InputStream stream) throws IOException {
        MediaType detected = tikaConfig.getMimeRepository().detect((InputStream) new BufferedInputStream(stream), new Metadata());
        logger.info("Mime Type detected in Resource: " + detected.toString());
        String[] htmlTypes = {"text/html", "application/xhtml+xml", "application/vnd.wap.xhtml+xml", "application/x-asp"};
        for (String htmlType : htmlTypes) {
            if (detected.toString().equals(htmlType)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Builds the content handler used while parsing: a {@link MediaUnitContentHandler} bound to
     * the document and wrapping a {@link BodyContentHandler} created with a limit of -1.
     *
     * @param document the document handed to the media unit handler
     * @return the new handler
     */
    private static MediaUnitContentHandler getMediaUnitContentHandler(Document document) throws TransformerFactoryConfigurationError {
        return new MediaUnitContentHandler((ContentHandler) new BodyContentHandler(-1), document);
    }

    /**
     * Creates a SAX handler that serialises the events it receives as indented XML into the given file.
     *
     * @param xhtmlFile the file receiving the serialised output
     * @return a transformer handler whose result is {@code xhtmlFile}
     * @throws TransformerConfigurationException if the handler cannot be created
     */
    private static ContentHandler getXmlContentHandler(File xhtmlFile) throws TransformerConfigurationException {
        TransformerHandler xmlHandler = ((SAXTransformerFactory) SAXTransformerFactory.newInstance()).newTransformerHandler();
        javax.xml.transform.Transformer serializer = xmlHandler.getTransformer();
        serializer.setOutputProperty("method", "xml");
        serializer.setOutputProperty("indent", "yes");
        xmlHandler.setResult(new StreamResult(xhtmlFile));
        return xmlHandler;
    }

    /**
     * Normalises the collected metadata in place.
     * <ol>
     * <li>Values of every predicate listed in {@code DATE_PREDS} are replaced by the result of
     * {@link #convertToISO8601Date(String)}.</li>
     * <li>Every value is passed through {@code XMLStringCleaner.getXMLRecommendedString};
     * null values and values that are blank before or after cleaning are dropped.</li>
     * <li>Predicates left without any value are removed from the map.</li>
     * </ol>
     *
     * @param toAnnot predicate URI to values; the value lists must be modifiable
     */
    protected static void cleanMap(Map<String, List<String>> toAnnot) {
        for (String datePred : DATE_PREDS) {
            List<String> rawDates = toAnnot.get(datePred);
            if (rawDates == null) {
                continue;
            }
            List<String> cleanedDates = new ArrayList<String>(rawDates.size());
            for (String rawDate : rawDates) {
                cleanedDates.add(TikaExtractorService.convertToISO8601Date(rawDate));
            }
            toAnnot.put(datePred, cleanedDates);
        }

        // Keys are collected first: removing from the map while iterating its entry set is not allowed.
        List<String> emptyPredicates = new ArrayList<String>();
        for (Map.Entry<String, List<String>> entry : toAnnot.entrySet()) {
            List<String> values = entry.getValue();
            if (values == null) {
                emptyPredicates.add(entry.getKey());
                continue;
            }
            ListIterator<String> it = values.listIterator();
            while (it.hasNext()) {
                String raw = it.next();
                String cleaned = raw == null ? null : XMLStringCleaner.getXMLRecommendedString(raw);
                // The blank test also covers the cleaned text: a value made only of characters
                // removed by the cleaner must not survive as an empty literal.
                if (cleaned == null || raw.trim().equals("") || cleaned.trim().equals("")) {
                    it.remove();
                } else {
                    it.set(cleaned);
                }
            }
            if (values.isEmpty()) {
                emptyPredicates.add(entry.getKey());
            }
        }
        for (String keyToRemove : emptyPredicates) {
            toAnnot.remove(keyToRemove);
        }
    }

    /**
     * Copies every Tika metadata entry into {@code toAnnot} under an RDF predicate.
     * <p>
     * Known names are mapped to Dublin Core (or RDFS) predicates. Depending on the predicate the
     * values are either appended to those already collected or replace them. Count-like names
     * are stored as {@code dcterms:extent} values with a unit suffix. Any other name becomes a
     * predicate built from {@code baseUri} and a cleaned form of the name; names that cannot be
     * turned into a valid URI are skipped with a warning.
     *
     * @param toAnnot  predicate URI to values, completed in place
     * @param metadata the metadata produced by the parser
     * @return {@code toAnnot}
     */
    protected static Map<String, List<String>> fillMapWithMetadata(Map<String, List<String>> toAnnot, Metadata metadata) {
        for (String name : metadata.names()) {
            List<String> values = new ArrayList<String>();
            if (metadata.isMultiValued(name)) {
                values.addAll(Arrays.asList(metadata.getValues(name)));
            } else if (metadata.get(name) != null) {
                values.add(metadata.get(name).trim());
            }

            String extentUnit = extentUnitFor(name);
            if (extentUnit != null) {
                appendValues(toAnnot, "http://purl.org/dc/terms/extent", TikaExtractorService.addUnitOnValues(values, extentUnit));
                continue;
            }
            String accumulated = accumulatedPredicateFor(name);
            if (accumulated != null) {
                appendValues(toAnnot, accumulated, values);
                continue;
            }
            String replaced = replacedPredicateFor(name);
            if (replaced != null) {
                toAnnot.put(replaced, values);
                continue;
            }
            String custom = customPredicateFor(name);
            if (custom != null) {
                toAnnot.put(custom, new ArrayList<String>(values));
            }
        }
        return toAnnot;
    }

    /** Appends {@code values} to the list stored under {@code predicate}, storing {@code values} itself when there is none yet. */
    private static void appendValues(Map<String, List<String>> toAnnot, String predicate, List<String> values) {
        List<String> existing = toAnnot.get(predicate);
        if (existing == null) {
            toAnnot.put(predicate, values);
        } else {
            existing.addAll(values);
        }
    }

    /** Returns the unit suffix for count-like metadata names, or {@code null} when the name is not a count. */
    private static String extentUnitFor(String name) {
        if (name.equals("Character Count")) {
            return " characters";
        }
        if (name.equals("Character-Count-With-Spaces")) {
            return " characters (with spaces)";
        }
        if (name.equals("Page-Count")) {
            return " pages";
        }
        if (name.equals("Word-Count")) {
            return " words";
        }
        if (name.equals("Paragraph-Count")) {
            return " paragraphs";
        }
        return null;
    }

    /** Returns the predicate whose values accumulate across metadata names, or {@code null} when {@code name} has none. */
    private static String accumulatedPredicateFor(String name) {
        if (name.equals("language")) {
            return "http://purl.org/dc/elements/1.1/language";
        }
        if (name.contains("http://purl.org/dc/elements/1.1/publisher")) {
            return "http://purl.org/dc/elements/1.1/publisher";
        }
        if (name.equals("Author") || name.equals("creator")) {
            return "http://purl.org/dc/elements/1.1/creator";
        }
        if (name.equals("Content-Location") || name.equals("Location") || name.equals("source")) {
            return "http://purl.org/dc/elements/1.1/source";
        }
        if (name.equals("Content-Type") || name.equals("format") || name.equals("mime.type.magic")) {
            return "http://purl.org/dc/elements/1.1/format";
        }
        if (name.equals("contributor") || name.equals("Last-Author") || name.equals("Company") || name.equals("Manager")) {
            // The predicate was previously misspelt ".../contibutor", which is not a Dublin Core element.
            return "http://purl.org/dc/elements/1.1/contributor";
        }
        // NOTE(review): DublinCore.DATE, HttpHeaders.LAST_MODIFIED and MSOffice.CREATION_DATE are
        // referenced as fields; if they are Tika Property objects rather than Strings,
        // String.equals never matches them. The literal names are therefore tested as well
        // (presumably "date", "Last-Modified" and "Creation-Date" - confirm against the Tika version in use).
        if (name.equals(DublinCore.DATE) || name.equals("date") || name.equals("Last-Printed")) {
            return "http://purl.org/dc/elements/1.1/date";
        }
        if (name.equals(HttpHeaders.LAST_MODIFIED) || name.equals("Last-Modified") || name.equals("modified") || name.equals("Last-Save-Date")) {
            return "http://purl.org/dc/terms/modified";
        }
        if (name.equals("Keywords") || name.equals("subject")) {
            return "http://purl.org/dc/elements/1.1/subject";
        }
        if (name.equals("License-Location") || name.equals("License-Url")) {
            return "http://purl.org/dc/terms/license";
        }
        if (name.equals("description") || name.equals("Notes") || name.equals("Category")) {
            return "http://purl.org/dc/elements/1.1/description";
        }
        return null;
    }

    /** Returns the predicate whose values are overwritten by the metadata name, or {@code null} when {@code name} has none. */
    private static String replacedPredicateFor(String name) {
        if (name.equals("identifier")) {
            return "http://purl.org/dc/elements/1.1/identifier";
        }
        if (name.equals("publisher")) {
            return "http://purl.org/dc/elements/1.1/publisher";
        }
        if (name.equals("relation")) {
            return "http://purl.org/dc/elements/1.1/relation";
        }
        if (name.equals("rights")) {
            return "http://purl.org/dc/elements/1.1/rights";
        }
        if (name.equals("title")) {
            return "http://purl.org/dc/elements/1.1/title";
        }
        if (name.equals("type")) {
            return "http://purl.org/dc/elements/1.1/type";
        }
        // See the NOTE(review) in accumulatedPredicateFor about the literal fallback.
        if (name.equals(MSOffice.CREATION_DATE) || name.equals("Creation-Date")) {
            return "http://purl.org/dc/terms/created";
        }
        if (name.equals("coverage")) {
            return "http://purl.org/dc/elements/1.1/coverage";
        }
        if (name.equals("Comments")) {
            return "http://www.w3.org/2000/01/rdf-schema#comment";
        }
        return null;
    }

    /** Builds a predicate from {@code baseUri} and the cleaned metadata name; returns {@code null} (after a warning) when no valid URI results. */
    private static String customPredicateFor(String name) {
        String cleanedName = name.replace('(', '_').replace(' ', '_').replace(')', '_').replace("N\u00b0", "N_").replace("n\u00b0", "n.").replace('$', '_').replace('/', '_').replace('\\', '_').replace('#', '_').replace('\'', '_').replace('.', '_').replace(',', '_').replace('?', '_').replace('!', '_').replace('@', '_');
        String candidate = baseUri + cleanedName;
        try {
            return new URL(candidate).toURI().toString();
        } catch (URISyntaxException urise) {
            logger.warn("Unable to transform the property '" + name + "' into a predicate (" + candidate + ")", urise);
        } catch (MalformedURLException murle) {
            logger.warn("Unable to transform the property '" + name + "' into a predicate (" + candidate + ")", murle);
        }
        return null;
    }

    /**
     * Validates the process arguments and returns the document they carry.
     *
     * @param args the arguments received by {@code process}
     * @return the resource of {@code args}, cast to {@link Document}
     * @throws InvalidParameterException if {@code args} is null, its resource is null, or the
     *                                   resource is not a {@link Document}
     */
    protected static Document checkArgs(ProcessArgs args) throws InvalidParameterException {
        if (args == null) {
            throw new InvalidParameterException("Invalid parameter from tika service.", "ProcessArgs was null.");
        }
        Resource res = args.getResource();
        if (res == null) {
            throw new InvalidParameterException("Invalid parameter from tika service.", "Resource of ProcessArgs was null.");
        }
        if (!(res instanceof Document)) {
            // The message names the type that is actually required (it used to say "ComposedUnit").
            throw new InvalidParameterException("Invalid parameter from tika service.", "Resource of ProcessArgs was not a Document, but a " + res.getClass().getName() + ".");
        }
        return (Document) res;
    }

    /**
     * Converts a date string produced by a parser into a {@code yyyy-MM-dd} date.
     * <p>
     * The input format is chosen from the shape of the trimmed string:
     * <ul>
     * <li>starting with a digit: {@code yyyy-MM-dd'T'HH:mm:ss}, then plain {@code yyyy-MM-dd};</li>
     * <li>containing a comma: {@code EEE, d MMM yyyy HH:mm:ss Z};</li>
     * <li>otherwise: {@code EEE MMM d HH:mm:ss z yyyy}.</li>
     * </ul>
     * Parsing and formatting use {@link Locale#ENGLISH}; the resulting day is expressed in the
     * JVM default time zone.
     *
     * @param inDateStr the date to convert; may be null
     * @return the converted date, or an empty string when the input is null, blank or unreadable
     */
    protected static String convertToISO8601Date(String inDateStr) {
        if (inDateStr == null) {
            return "";
        }
        String trimmed = inDateStr.trim();
        if (trimmed.equals("")) {
            return "";
        }

        String[] patterns;
        if (Character.isDigit(trimmed.charAt(0))) {
            // Date-only values are accepted too instead of being silently dropped.
            patterns = new String[] {"yyyy-MM-dd'T'HH:mm:ss", "yyyy-MM-dd"};
        } else if (trimmed.contains(",")) {
            patterns = new String[] {"EEE, d MMM yyyy HH:mm:ss Z"};
        } else {
            // HH (0-23), not hh (1-12): with hh, "12:30:00" is read as 00:30:00, which can
            // shift the resulting day once converted to the local time zone.
            patterns = new String[] {"EEE MMM d HH:mm:ss z yyyy"};
        }

        Date date = null;
        ParseException lastFailure = null;
        for (String pattern : patterns) {
            try {
                // SimpleDateFormat is not thread-safe, so a fresh instance is used per call.
                date = new SimpleDateFormat(pattern, Locale.ENGLISH).parse(trimmed);
                break;
            } catch (ParseException pe) {
                lastFailure = pe;
            }
        }
        if (date == null) {
            logger.warn("Unable to read date: '" + trimmed + "'.", lastFailure);
            return "";
        }
        // A fixed locale keeps the output independent from the default locale's calendar and digits.
        return new SimpleDateFormat("yyyy-MM-dd", Locale.ENGLISH).format(date);
    }

    /**
     * Builds a new list in which every value is followed by the given unit.
     *
     * @param values the values to suffix; left untouched
     * @param unit   the text appended to each value
     * @return a new list, in the same order as {@code values}
     */
    protected static List<String> addUnitOnValues(List<String> values, String unit) {
        ListIterator<String> cursor = values.listIterator();
        ArrayList<String> suffixed = new ArrayList<String>();
        while (cursor.hasNext()) {
            String current = cursor.next();
            suffixed.add(current + unit);
        }
        return suffixed;
    }

    /**
     * Loads the Tika configuration from the {@code tika-config.xml} classpath resource.
     *
     * @return the configuration built from that file
     * @throws AccessDeniedException if the resource cannot be resolved to a file, read or parsed;
     *                               the original exception is attached as cause
     */
    protected static TikaConfig getTikaConfig() throws AccessDeniedException {
        try {
            File configFile = new ClassPathResource("tika-config.xml").getFile();
            return new TikaConfig(configFile);
        } catch (TikaException tikaFailure) {
            throw new AccessDeniedException("Unable to load default Tika Config.", tikaFailure.getMessage(), (Throwable) tikaFailure);
        } catch (IOException ioFailure) {
            throw new AccessDeniedException("Unable to load default Tika Config.", ioFailure.getMessage(), (Throwable) ioFailure);
        } catch (SAXException saxFailure) {
            throw new AccessDeniedException("Unable to load default Tika Config.", saxFailure.getMessage(), (Throwable) saxFailure);
        }
    }

    /**
     * Reads the service properties file and overrides the static settings {@code baseUri},
     * {@code removeContent} and {@code overrideMetadata} with every property that is present
     * and not empty. Properties that are missing or empty leave the current value untouched.
     * <p>
     * The emptiness test used to be inverted, so a configured value was only applied when it
     * was the empty string.
     */
    protected void loadTikaServiceProps() {
        Map<?, ?> props = PropertiesLoader.loadProperties(CONFIG_FILE);

        String configuredBaseUri = nonEmptyProperty(props, BASE_URI_PROPERTY_NAME);
        if (configuredBaseUri != null) {
            baseUri = configuredBaseUri;
        }
        String configuredRemoveContent = nonEmptyProperty(props, REMOVE_COTNENT_PROPERTY_NAME);
        if (configuredRemoveContent != null) {
            removeContent = Boolean.parseBoolean(configuredRemoveContent);
        }
        String configuredOverride = nonEmptyProperty(props, OVERRIDE_METADATA_PROPERTY_NAME);
        if (configuredOverride != null) {
            overrideMetadata = Boolean.parseBoolean(configuredOverride);
        }
    }

    /** Returns the trimmed value stored under {@code key}, or {@code null} when it is absent or blank. */
    private static String nonEmptyProperty(Map<?, ?> props, String key) {
        Object raw = props.get(key);
        if (raw == null) {
            return null;
        }
        String value = raw.toString().trim();
        return value.isEmpty() ? null : value;
    }

    // Predicates holding dates, exposed as an unmodifiable list.
    static {
        DATE_PREDS = Collections.unmodifiableList(Arrays.asList(
                "http://purl.org/dc/terms/created",
                "http://purl.org/dc/terms/date",
                "http://purl.org/dc/terms/dateAccepted",
                "http://purl.org/dc/terms/dateCopyrighted",
                "http://purl.org/dc/terms/dateSubmitted",
                "http://purl.org/dc/terms/issued",
                "http://purl.org/dc/terms/modified",
                "http://purl.org/dc/elements/1.1/date"));
    }
}

