/*
 * Decompiled with CFR 0.152.
 */
package org.wso2.extension.siddhi.execution.tokenizer;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;
import org.wso2.siddhi.annotation.Example;
import org.wso2.siddhi.annotation.Extension;
import org.wso2.siddhi.annotation.Parameter;
import org.wso2.siddhi.annotation.util.DataType;
import org.wso2.siddhi.core.config.SiddhiAppContext;
import org.wso2.siddhi.core.event.ComplexEvent;
import org.wso2.siddhi.core.event.ComplexEventChunk;
import org.wso2.siddhi.core.event.stream.StreamEvent;
import org.wso2.siddhi.core.event.stream.StreamEventCloner;
import org.wso2.siddhi.core.event.stream.populater.ComplexEventPopulater;
import org.wso2.siddhi.core.exception.SiddhiAppCreationException;
import org.wso2.siddhi.core.executor.ExpressionExecutor;
import org.wso2.siddhi.core.query.processor.Processor;
import org.wso2.siddhi.core.query.processor.stream.StreamProcessor;
import org.wso2.siddhi.core.util.config.ConfigReader;
import org.wso2.siddhi.query.api.definition.AbstractDefinition;
import org.wso2.siddhi.query.api.definition.Attribute;

/**
 * Siddhi stream processor ({@code text:tokenize}) that splits the text of each incoming event
 * into word tokens.
 *
 * <p>For every event the text is cleaned (emoji-range characters, URLs, {@code @}/{@code #}
 * fragments, digits and a few symbols are removed), split on punctuation/bracket/decoration
 * delimiters, and every non-empty token that is not listed in the {@code /words.csv} classpath
 * resource is populated into the event as the {@code token} attribute and handed to the next
 * processor.
 */
@Extension(name="tokenize", namespace="text", description="This splits a string into words", parameters={@Parameter(name="text", description="The input text which should be split.", type={DataType.STRING})}, examples={@Example(syntax="define stream inputStream (text string);\n@info(name = 'query1')\nfrom inputStream#text:tokenize(text)\nselect text\ninsert into outputStream;", description="This query performs tokenization for the given string.")})
public class TweetTextTokenizer
extends StreamProcessor {
    private static final Logger log = Logger.getLogger(TweetTextTokenizer.class);

    /** Classpath location of the list of words that must not be emitted as tokens. */
    private static final String WORD_LIST_RESOURCE = "/words.csv";

    private static final String URL_PATTERN = "(https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]";
    private static final String PUNCT_CHARS = "[\\s+'\u201c\u201d\u2018\u2019\\\".?!,:;&]";
    private static final String BRACKETS = "[<>\u00ab\u00bb{}\\(\\)\\[\\]]";
    private static final String DECORATIONS = "[\u266b\u266a]+";
    private static final String TIME_LIKE = "\\d+:\\d+";
    private static final String NUM_NUM = "\\d+\\.\\d+";

    /** Delimiters on which the cleaned text is split into tokens; compiled once and reused. */
    private static final Pattern TOKEN_DELIMITER = Pattern.compile(PUNCT_CHARS + "|" + BRACKETS + "|" + TIME_LIKE + "|" + NUM_NUM + "|" + DECORATIONS);

    /**
     * Fragments removed from the text before splitting.
     * NOTE(review): "@(.*)" and "#(.*)" are greedy, so everything from the first '@' or '#' up to
     * the end of the line is dropped, not just the mention/hashtag itself - confirm this is intended.
     */
    private static final Pattern NOISE = Pattern.compile(URL_PATTERN + "|@(.*)|#(.*)|[0-9]+|\u203c|\u2026");

    /** Characters stripped by {@link #removeEmojis(String)}; the flags are the named form of the former literal 194. */
    private static final Pattern UNICODE_OUTLIERS = Pattern.compile("[\ud83c\udc00-\ud83c\udfff]|[\ud83d\udc00-\ud83d\udfff]|[\u2600-\u27ff]", Pattern.CANON_EQ | Pattern.UNICODE_CASE | Pattern.CASE_INSENSITIVE);

    /** Words read from {@link #WORD_LIST_RESOURCE}; tokens equal to one of them (ignoring case) are skipped. */
    private final List<String> wordList = new ArrayList<String>();

    /**
     * Tokenizes the text of every event in the chunk and forwards one populated event per
     * accepted token.
     *
     * @param streamEventChunk      events to tokenize
     * @param nextProcessor         processor that receives the chunk after a token is populated
     * @param streamEventCloner     not used by this processor
     * @param complexEventPopulater writes the token into the event's output data
     */
    protected void process(ComplexEventChunk<StreamEvent> streamEventChunk, Processor nextProcessor, StreamEventCloner streamEventCloner, ComplexEventPopulater complexEventPopulater) {
        while (streamEventChunk.hasNext()) {
            StreamEvent streamEvent = (StreamEvent)streamEventChunk.next();
            String text = (String)this.attributeExpressionExecutors[0].execute((ComplexEvent)streamEvent);
            if (text == null) {
                // Nothing to tokenize; previously this caused a NullPointerException in removeEmojis.
                continue;
            }
            String cleaned = NOISE.matcher(this.removeEmojis(text)).replaceAll("");
            for (String word : TOKEN_DELIMITER.split(cleaned)) {
                if (word.isEmpty() || !this.isMeaningful(word)) {
                    continue;
                }
                Object[] data = new Object[]{word};
                complexEventPopulater.populateComplexEvent((ComplexEvent)streamEvent, data);
                // NOTE(review): the whole chunk is passed downstream once per token while it is
                // still being iterated here - confirm this is the intended framework usage.
                nextProcessor.process(streamEventChunk);
            }
        }
    }

    /**
     * Validates the extension arguments, loads the word list and declares the output attribute.
     *
     * @param inputDefinition              definition of the input stream (not used here)
     * @param attributeExpressionExecutors executors for the extension arguments; exactly one of
     *                                     type STRING is required
     * @param configReader                 not used by this processor
     * @param siddhiAppContext             not used by this processor
     * @return a single STRING attribute named {@code token}
     * @throws IllegalArgumentException   if the number of arguments is not 1
     * @throws SiddhiAppCreationException if the argument is not of type STRING
     */
    protected List<Attribute> init(AbstractDefinition inputDefinition, ExpressionExecutor[] attributeExpressionExecutors, ConfigReader configReader, SiddhiAppContext siddhiAppContext) {
        if (attributeExpressionExecutors.length != 1) {
            throw new IllegalArgumentException("Invalid no of arguments passed to text:tokenize() function, required 1, but found " + attributeExpressionExecutors.length);
        }
        if (attributeExpressionExecutors[0].getReturnType() != Attribute.Type.STRING) {
            throw new SiddhiAppCreationException("Text should be of type string. But found " + attributeExpressionExecutors[0].getReturnType());
        }
        this.loadWordList();
        List<Attribute> attributes = new ArrayList<Attribute>();
        attributes.add(new Attribute("token", Attribute.Type.STRING));
        return attributes;
    }

    /** No resources to acquire on start. */
    public void start() {
    }

    /** No resources to release on stop. */
    public void stop() {
    }

    /**
     * This processor keeps no restorable state.
     *
     * @return always {@code null}
     */
    public Map<String, Object> currentState() {
        return null;
    }

    /**
     * Ignores the supplied state; there is nothing to restore.
     *
     * @param state ignored
     */
    public void restoreState(Map<String, Object> state) {
    }

    /**
     * Reads {@link #WORD_LIST_RESOURCE} line by line into {@link #wordList}. Failures are logged
     * and leave the list with whatever was read so far, so the processor still starts.
     */
    private void loadWordList() {
        InputStream inputStream = TweetTextTokenizer.class.getResourceAsStream(WORD_LIST_RESOURCE);
        if (inputStream == null) {
            // getResourceAsStream signals a missing resource with null, not FileNotFoundException;
            // without this check the reader constructor below throws a NullPointerException.
            log.error("File is not found : " + WORD_LIST_RESOURCE);
            return;
        }
        try (BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8))) {
            String line;
            while ((line = bufferedReader.readLine()) != null) {
                this.wordList.add(line);
            }
        }
        catch (IOException e) {
            log.error("Error occurred while reading file : " + e.getMessage(), e);
        }
    }

    /**
     * Tells whether a token should be emitted.
     *
     * @param word candidate token
     * @return {@code false} if the word equals an entry of the loaded word list ignoring case,
     *         {@code true} otherwise
     */
    private boolean isMeaningful(String word) {
        for (String listed : this.wordList) {
            if (listed.equalsIgnoreCase(word)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Removes every character matched by {@link #UNICODE_OUTLIERS} from the text.
     *
     * @param text non-null input text
     * @return the text with all matches removed
     */
    private String removeEmojis(String text) {
        Matcher unicodeOutlierMatcher = UNICODE_OUTLIERS.matcher(text);
        return unicodeOutlierMatcher.replaceAll("");
    }
}

